Snap for 8730993 from 16f96a2cb9f5d2764be7d8fad4eac6ae7167cd92 to mainline-tzdata3-release Change-Id: I00bdb927224cddfb55142e7cea32eac744283e67

commit: e23e8bda0c147c900c336366117b8d28a28a7fee [log] [tgz]
author: Android Build Coastguard Worker <android-build-coastguard-worker@google.com> Wed Jun 15 21:48:09 2022 +0000
committer: Android Build Coastguard Worker <android-build-coastguard-worker@google.com> Wed Jun 15 21:48:09 2022 +0000
tree: c0726a51cec8ede53aa7796a05d2b75480ce0cd0
parent: 9c7eeb2dc2c0ea2af889c3dccdb8e9c4b4a7aeb3 [diff]
parent: 16f96a2cb9f5d2764be7d8fad4eac6ae7167cd92 [diff]
diff --git a/Android.bp b/Android.bp
index a2eaf6f..ab53382 100644
--- a/Android.bp
+++ b/Android.bp

@@ -370,5 +370,4 @@
     "cpu_ref",
     "script_api",
     "support",
-    "toolkit",
 ]

diff --git a/build_rs.py b/build_rs.py
index c2faa5b..0416109 100755
--- a/build_rs.py
+++ b/build_rs.py

@@ -56,17 +56,14 @@
 
 
 def build(out_dir):
-    if sys.platform == 'darwin':
-        products = ('aosp_arm',)
-    else:
-        products = (
-            'aosp_arm',
-            'aosp_arm64',
-            # 'aosp_mips',
-            # 'aosp_mips64',
-            'aosp_x86',
-            'aosp_x86_64',
-        )
+    products = (
+        'aosp_arm',
+        'aosp_arm64',
+        # 'aosp_mips',
+        # 'aosp_mips64',
+        'aosp_x86',
+        'aosp_x86_64',
+    )
     for product in products:
         build_product(out_dir, product)
 
@@ -81,19 +78,13 @@
     env['TARGET_BUILD_VARIANT'] = 'userdebug'
     env['TARGET_PRODUCT'] = product
 
-    if sys.platform == 'darwin':
-        targets = [
-            'llvm-rs-cc',
-            'bcc_compat',
-        ]
-    else:
-        targets = [
-            # PHONY target specified in frameworks/rs/Android.mk.
-            'rs-prebuilts-full',
-            # We have to explicitly specify the jar for JACK to build.
-            android_path('out/target/common/obj/JAVA_LIBRARIES/' +
-                'android-support-v8-renderscript_intermediates/classes.jar')
-        ]
+    targets = [
+        # PHONY target specified in frameworks/rs/Android.mk.
+        'rs-prebuilts-full',
+        # We have to explicitly specify the jar for JACK to build.
+        android_path('out/target/common/obj/JAVA_LIBRARIES/' +
+            'android-support-v8-renderscript_intermediates/classes.jar')
+    ]
     subprocess.check_call(
         ['build/soong/soong_ui.bash', '--make-mode'] + targets, cwd=android_path(), env=env)
 
@@ -122,8 +113,7 @@
 def install_toolchain(build_dir, install_dir, host):
     install_built_host_files(build_dir, install_dir, host)
     install_clang_headers(build_dir, install_dir, host)
-    if not host.startswith('darwin'):
-        install_built_device_files(build_dir, install_dir, host)
+    install_built_device_files(build_dir, install_dir, host)
     install_license_files(install_dir)
     # We need to package libwinpthread-1.dll for Windows. This is explicitly
     # linked whenever pthreads is used, and the build system doesn't allow

diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index d102488..ce30092 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp

@@ -109,15 +109,6 @@
 namespace android {
 namespace renderscript {
 
-// Convert vector to uchar4, clipping each value to 255.
-template <typename TI>
-static inline uchar4 convertClipped(TI amount) {
-    return uchar4 { static_cast<uchar>(amount.x > 255 ? 255 : amount.x),
-                    static_cast<uchar>(amount.y > 255 ? 255 : amount.y),
-                    static_cast<uchar>(amount.z > 255 ? 255 : amount.z),
-                    static_cast<uchar>(amount.w > 255 ? 255 : amount.w)};
-}
-
 void RsdCpuScriptIntrinsicBlend::kernel(const RsExpandKernelDriverInfo *info,
                                         uint32_t xstart, uint32_t xend,
                                         uint32_t outstep) {
@@ -129,11 +120,8 @@
 
 #if defined(ARCH_ARM_USE_INTRINSICS)
     if (gArchUseSIMD) {
-        if (rsdIntrinsicBlend_K(out, in, info->slot, 0, x2 - x1) >= 0) {
+        if (rsdIntrinsicBlend_K(out, in, info->slot, x1, x2) >= 0)
             return;
-        } else {
-            ALOGW("Intrinsic Blend failed to use SIMD for %d", info->slot);
-        }
     }
 #endif
     switch (info->slot) {
@@ -163,10 +151,10 @@
         }
     #endif
         for (;x1 < x2; x1++, out++, in++) {
-            ushort4 in_s = convert_ushort4(*in);
-            ushort4 out_s = convert_ushort4(*out);
-            in_s = in_s + ((out_s * (ushort4)(255 - in_s.w)) >> (ushort4)8);
-            *out = convertClipped(in_s);
+            short4 in_s = convert_short4(*in);
+            short4 out_s = convert_short4(*out);
+            in_s = in_s + ((out_s * (short4)(255 - in_s.w)) >> (short4)8);
+            *out = convert_uchar4(in_s);
         }
         break;
     case BLEND_DST_OVER:
@@ -182,10 +170,10 @@
         }
      #endif
         for (;x1 < x2; x1++, out++, in++) {
-            ushort4 in_s = convert_ushort4(*in);
-            ushort4 out_s = convert_ushort4(*out);
-            in_s = out_s + ((in_s * (ushort4)(255 - out_s.w)) >> (ushort4)8);
-            *out = convertClipped(in_s);
+            short4 in_s = convert_short4(*in);
+            short4 out_s = convert_short4(*out);
+            in_s = out_s + ((in_s * (short4)(255 - out_s.w)) >> (short4)8);
+            *out = convert_uchar4(in_s);
         }
         break;
     case BLEND_SRC_IN:
@@ -201,8 +189,8 @@
         }
     #endif
         for (;x1 < x2; x1++, out++, in++) {
-            ushort4 in_s = convert_ushort4(*in);
-            in_s = (in_s * out->w) >> (ushort4)8;
+            short4 in_s = convert_short4(*in);
+            in_s = (in_s * out->w) >> (short4)8;
             *out = convert_uchar4(in_s);
         }
         break;
@@ -273,14 +261,11 @@
         }
     #endif
         for (;x1 < x2; x1++, out++, in++) {
-            // The max value the operation could produce before the shift
-            // is 255 * 255 + 255 * (255 - 0) = 130050, or 0x1FC02.
-            // That value does not fit in a ushort, so we use uint.
-            uint4 in_s = convert_uint4(*in);
-            uint4 out_s = convert_uint4(*out);
+            short4 in_s = convert_short4(*in);
+            short4 out_s = convert_short4(*out);
             out_s.xyz = ((in_s.xyz * out_s.w) +
-              (out_s.xyz * ((uint3)255 - (uint3)in_s.w))) >> (uint3)8;
-            *out = convertClipped(out_s);
+              (out_s.xyz * ((short3)255 - (short3)in_s.w))) >> (short3)8;
+            *out = convert_uchar4(out_s);
         }
         break;
     case BLEND_DST_ATOP:
@@ -296,12 +281,12 @@
         }
      #endif
         for (;x1 < x2; x1++, out++, in++) {
-            uint4 in_s = convert_uint4(*in);
-            uint4 out_s = convert_uint4(*out);
+            short4 in_s = convert_short4(*in);
+            short4 out_s = convert_short4(*out);
             out_s.xyz = ((out_s.xyz * in_s.w) +
-              (in_s.xyz * ((uint3)255 - (uint3)out_s.w))) >> (uint3)8;
+              (in_s.xyz * ((short3)255 - (short3)out_s.w))) >> (short3)8;
             out_s.w = in_s.w;
-            *out = convertClipped(out_s);
+            *out = convert_uchar4(out_s);
         }
         break;
     case BLEND_XOR:

diff --git a/cpu_ref/rsCpuIntrinsicResize.cpp b/cpu_ref/rsCpuIntrinsicResize.cpp
index 8afa2ed..8a3dd1a 100644
--- a/cpu_ref/rsCpuIntrinsicResize.cpp
+++ b/cpu_ref/rsCpuIntrinsicResize.cpp

@@ -353,7 +353,7 @@
     const uchar4 *yp2 = (const uchar4 *)(pin + stride * ys2);
     const uchar4 *yp3 = (const uchar4 *)(pin + stride * ys3);
 
-    uchar4 *out = ((uchar4 *)info->outPtr[0]);
+    uchar4 *out = ((uchar4 *)info->outPtr[0]) + xstart;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -430,7 +430,7 @@
     const uchar2 *yp2 = (const uchar2 *)(pin + stride * ys2);
     const uchar2 *yp3 = (const uchar2 *)(pin + stride * ys3);
 
-    uchar2 *out = ((uchar2 *)info->outPtr[0]);
+    uchar2 *out = ((uchar2 *)info->outPtr[0]) + xstart;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -508,7 +508,7 @@
     const uchar *yp2 = pin + stride * ys2;
     const uchar *yp3 = pin + stride * ys3;
 
-    uchar *out = ((uchar *)info->outPtr[0]);
+    uchar *out = ((uchar *)info->outPtr[0]) + xstart;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -586,7 +586,7 @@
     const float4 *yp2 = (const float4 *)(pin + stride * ys2);
     const float4 *yp3 = (const float4 *)(pin + stride * ys3);
 
-    float4 *out = ((float4 *)info->outPtr[0]);
+    float4 *out = ((float4 *)info->outPtr[0]) + xstart;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -638,7 +638,7 @@
     const float2 *yp2 = (const float2 *)(pin + stride * ys2);
     const float2 *yp3 = (const float2 *)(pin + stride * ys3);
 
-    float2 *out = ((float2 *)info->outPtr[0]);
+    float2 *out = ((float2 *)info->outPtr[0]) + xstart;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -690,7 +690,7 @@
     const float *yp2 = (const float *)(pin + stride * ys2);
     const float *yp3 = (const float *)(pin + stride * ys3);
 
-    float *out = ((float *)info->outPtr[0]);
+    float *out = ((float *)info->outPtr[0]) + xstart;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 

diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_Blend.S b/cpu_ref/rsCpuIntrinsics_advsimd_Blend.S
index 1473336..b4a2b7c 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_Blend.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_Blend.S

@@ -593,7 +593,7 @@
 ENTRY(rsdIntrinsicBlend_K)
     adrp    x5, blendtable
     add     x5, x5, :lo12:blendtable
-    cmp     w2, tablesize
+    cmp     w2, tablesize >> 1
     bhs     1f
     ldrsh   x6, [x5, w2, uxtw #1]
     add     x0, x0, w3, uxtw #2
@@ -615,3 +615,4 @@
 #define BLEND_X(d, n) .rept d-off ; .hword 0 ; .endr ; .hword blend_line_##n - 2b ; .set off, d+1 ;
         BLEND_LIST(BLEND_X)
 #undef BLEND_X
+

diff --git a/script_api/Android.bp b/script_api/Android.bp
index 8046bd0..0f026bf 100644
--- a/script_api/Android.bp
+++ b/script_api/Android.bp

@@ -26,9 +26,3 @@
         never: true,
     },
 }
-
-filegroup {
-    name: "rs_script_api",
-    srcs: ["include/*.rsh"],
-    path: "include",
-}

diff --git a/tests/cpp_api/Android.bp b/tests/cpp_api/Android.bp
deleted file mode 100644
index 9c3632a..0000000
--- a/tests/cpp_api/Android.bp
+++ /dev/null

@@ -1,31 +0,0 @@
-// Copyright (C) 2021 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-cc_defaults {
-    name: "frameworks_rs_tests_cpp-api-defaults",
-    shared_libs: ["liblog"],
-    cflags: [
-        "-Werror",
-        "-Wall",
-        "-Wextra",
-    ],
-    header_libs: ["rs-headers"],
-}
-

diff --git a/tests/cpp_api/Android.mk b/tests/cpp_api/Android.mk
new file mode 100644
index 0000000..6145a3d
--- /dev/null
+++ b/tests/cpp_api/Android.mk

@@ -0,0 +1,3 @@
+LOCAL_PATH:=$(call my-dir)
+
+include $(call all-makefiles-under,$(LOCAL_PATH))

diff --git a/tests/cpp_api/common.mk b/tests/cpp_api/common.mk
new file mode 100644
index 0000000..9bad790
--- /dev/null
+++ b/tests/cpp_api/common.mk

@@ -0,0 +1,7 @@
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_CFLAGS += -Werror -Wall -Wextra
+LOCAL_LDFLAGS +=  -llog
+
+intermediates := $(call intermediates-dir-for,STATIC_LIBRARIES,libRS,TARGET,)
+LOCAL_C_INCLUDES += $(intermediates)

diff --git a/tests/cpp_api/cpp-globalguard/Android.bp b/tests/cpp_api/cpp-globalguard/Android.bp
deleted file mode 100644
index c0ac0f9..0000000
--- a/tests/cpp_api/cpp-globalguard/Android.bp
+++ /dev/null

@@ -1,34 +0,0 @@
-//
-// Copyright (C) 2021 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: [
-        "Android-Apache-2.0",
-    ],
-}
-
-cc_binary {
-    name: "rstest-cpp-globalguard",
-    defaults: ["frameworks_rs_tests_cpp-api-defaults"],
-    sdk_version: "21",
-    stl: "c++_static",
-    srcs: [
-        "multiply.rscript",
-        "compute.cpp",
-    ],
-    static_libs: ["libRScpp_static"],
-}

diff --git a/tests/cpp_api/cpp-globalguard/Android.mk b/tests/cpp_api/cpp-globalguard/Android.mk
new file mode 100644
index 0000000..88a10c9
--- /dev/null
+++ b/tests/cpp_api/cpp-globalguard/Android.mk

@@ -0,0 +1,21 @@
+LOCAL_PATH:= $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE:= rstest-cpp-globalguard
+LOCAL_LICENSE_KINDS:= SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS:= notice
+
+LOCAL_SDK_VERSION := 21
+LOCAL_NDK_STL_VARIANT := c++_static
+
+LOCAL_SRC_FILES:= \
+	multiply.rscript \
+	compute.cpp
+
+LOCAL_STATIC_LIBRARIES := \
+	libRScpp_static
+
+LOCAL_LDFLAGS += -llog
+
+include frameworks/rs/tests/cpp_api/common.mk
+include $(BUILD_EXECUTABLE)

diff --git a/tests/cpp_api/cppallocation/Android.bp b/tests/cpp_api/cppallocation/Android.bp
deleted file mode 100644
index 210969b..0000000
--- a/tests/cpp_api/cppallocation/Android.bp
+++ /dev/null

@@ -1,32 +0,0 @@
-//
-// Copyright (C) 2021 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-cc_binary {
-    name: "rstest-cppallocation",
-    defaults: ["frameworks_rs_tests_cpp-api-defaults"],
-    sdk_version: "21",
-    stl: "c++_static",
-    srcs: [
-        "multiply.rscript",
-        "compute.cpp",
-    ],
-    static_libs: ["libRScpp_static"],
-}

diff --git a/tests/cpp_api/cppallocation/Android.mk b/tests/cpp_api/cppallocation/Android.mk
new file mode 100644
index 0000000..d2c7cbc
--- /dev/null
+++ b/tests/cpp_api/cppallocation/Android.mk

@@ -0,0 +1,21 @@
+LOCAL_PATH:= $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE:= rstest-cppallocation
+LOCAL_LICENSE_KINDS:= SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS:= notice
+
+LOCAL_SDK_VERSION := 21
+LOCAL_NDK_STL_VARIANT := c++_static
+
+LOCAL_SRC_FILES:= \
+	multiply.rscript \
+	compute.cpp
+
+LOCAL_STATIC_LIBRARIES := \
+	libRScpp_static
+
+LOCAL_LDFLAGS += -llog
+
+include frameworks/rs/tests/cpp_api/common.mk
+include $(BUILD_EXECUTABLE)

diff --git a/tests/cpp_api/cppbasic-getpointer/Android.bp b/tests/cpp_api/cppbasic-getpointer/Android.bp
deleted file mode 100644
index 203a8c9..0000000
--- a/tests/cpp_api/cppbasic-getpointer/Android.bp
+++ /dev/null

@@ -1,32 +0,0 @@
-//
-// Copyright (C) 2021 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: [
-        "Android-Apache-2.0",
-    ],
-}
-
-cc_binary {
-    name: "rstest-compute-getpointer",
-    defaults: ["frameworks_rs_tests_cpp-api-defaults"],
-    srcs: [
-        "mono.rscript",
-        "compute.cpp",
-    ],
-    shared_libs: ["libRScpp"],
-}

diff --git a/tests/cpp_api/cppbasic-getpointer/Android.mk b/tests/cpp_api/cppbasic-getpointer/Android.mk
new file mode 100644
index 0000000..963a3e4
--- /dev/null
+++ b/tests/cpp_api/cppbasic-getpointer/Android.mk

@@ -0,0 +1,18 @@
+LOCAL_PATH:= $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE:= rstest-compute-getpointer
+LOCAL_LICENSE_KINDS:= SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS:= notice
+
+LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
+
+LOCAL_SRC_FILES:= \
+	mono.rscript \
+	compute.cpp
+
+LOCAL_SHARED_LIBRARIES := \
+	libRScpp
+
+include frameworks/rs/tests/cpp_api/common.mk
+include $(BUILD_EXECUTABLE)

diff --git a/tests/cpp_api/cppbasic-shared/Android.bp b/tests/cpp_api/cppbasic-shared/Android.bp
deleted file mode 100644
index 0b49d86..0000000
--- a/tests/cpp_api/cppbasic-shared/Android.bp
+++ /dev/null

@@ -1,32 +0,0 @@
-//
-// Copyright (C) 2021 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: [
-        "Android-Apache-2.0",
-    ],
-}
-
-cc_binary {
-    name: "rstest-compute-shared",
-    defaults: ["frameworks_rs_tests_cpp-api-defaults"],
-    srcs: [
-        "mono.rscript",
-        "compute.cpp",
-    ],
-    shared_libs: ["libRScpp"],
-}

diff --git a/tests/cpp_api/cppbasic-shared/Android.mk b/tests/cpp_api/cppbasic-shared/Android.mk
new file mode 100644
index 0000000..ca91745
--- /dev/null
+++ b/tests/cpp_api/cppbasic-shared/Android.mk

@@ -0,0 +1,18 @@
+LOCAL_PATH:= $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE:= rstest-compute-shared
+LOCAL_LICENSE_KINDS:= SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS:= notice
+
+LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
+
+LOCAL_SRC_FILES:= \
+	mono.rscript \
+	compute.cpp
+
+LOCAL_SHARED_LIBRARIES := \
+	libRScpp
+
+include frameworks/rs/tests/cpp_api/common.mk
+include $(BUILD_EXECUTABLE)

diff --git a/tests/cpp_api/cppbasic/Android.bp b/tests/cpp_api/cppbasic/Android.bp
deleted file mode 100644
index d6723ca..0000000
--- a/tests/cpp_api/cppbasic/Android.bp
+++ /dev/null

@@ -1,34 +0,0 @@
-//
-// Copyright (C) 2021 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: [
-        "Android-Apache-2.0",
-    ],
-}
-
-cc_binary {
-    name: "rstest-compute",
-    defaults: ["frameworks_rs_tests_cpp-api-defaults"],
-    sdk_version: "21",
-    stl: "c++_static",
-    srcs: [
-        "mono.rscript",
-        "compute.cpp",
-    ],
-    static_libs: ["libRScpp_static"],
-}

diff --git a/tests/cpp_api/cppbasic/Android.mk b/tests/cpp_api/cppbasic/Android.mk
new file mode 100644
index 0000000..a1f090f
--- /dev/null
+++ b/tests/cpp_api/cppbasic/Android.mk

@@ -0,0 +1,19 @@
+LOCAL_PATH:= $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE:= rstest-compute
+LOCAL_LICENSE_KINDS:= SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS:= notice
+
+LOCAL_SDK_VERSION := 21
+LOCAL_NDK_STL_VARIANT := c++_static
+
+LOCAL_SRC_FILES:= \
+	mono.rscript \
+	compute.cpp
+
+LOCAL_STATIC_LIBRARIES := \
+	libRScpp_static
+
+include frameworks/rs/tests/cpp_api/common.mk
+include $(BUILD_EXECUTABLE)

diff --git a/tests/cpp_api/cppf16/Android.bp b/tests/cpp_api/cppf16/Android.bp
deleted file mode 100644
index 4aaca31..0000000
--- a/tests/cpp_api/cppf16/Android.bp
+++ /dev/null

@@ -1,33 +0,0 @@
-//
-// Copyright (C) 2021 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: [
-        "Android-Apache-2.0",
-    ],
-}
-
-cc_binary {
-    name: "rstest-cppf16",
-    defaults: ["frameworks_rs_tests_cpp-api-defaults"],
-    stl: "c++_static",
-    srcs: ["compute.cpp"],
-    static_libs: ["libRScpp_static"],
-    shared_libs: [
-        "libdl",
-    ],
-}

diff --git a/tests/cpp_api/cppf16/Android.mk b/tests/cpp_api/cppf16/Android.mk
new file mode 100644
index 0000000..eca91db
--- /dev/null
+++ b/tests/cpp_api/cppf16/Android.mk

@@ -0,0 +1,19 @@
+LOCAL_PATH:= $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE:= rstest-cppf16
+LOCAL_LICENSE_KINDS:= SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS:= notice
+
+LOCAL_NDK_STL_VARIANT := c++_static
+
+LOCAL_SRC_FILES:= \
+	compute.cpp
+
+LOCAL_STATIC_LIBRARIES := \
+	libRScpp_static
+
+LOCAL_LDFLAGS += -llog -ldl
+
+include frameworks/rs/tests/cpp_api/common.mk
+include $(BUILD_EXECUTABLE)

diff --git a/tests/cpp_api/cppstrided/Android.bp b/tests/cpp_api/cppstrided/Android.bp
deleted file mode 100644
index df3fc93..0000000
--- a/tests/cpp_api/cppstrided/Android.bp
+++ /dev/null

@@ -1,34 +0,0 @@
-//
-// Copyright (C) 2021 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: [
-        "Android-Apache-2.0",
-    ],
-}
-
-cc_binary {
-    name: "rstest-cppstrided",
-    defaults: ["frameworks_rs_tests_cpp-api-defaults"],
-    sdk_version: "21",
-    stl: "c++_static",
-    srcs: [
-        "multiply.rscript",
-        "compute.cpp",
-    ],
-    static_libs: ["libRScpp_static"],
-}

diff --git a/tests/cpp_api/cppstrided/Android.mk b/tests/cpp_api/cppstrided/Android.mk
new file mode 100644
index 0000000..e0e03b3
--- /dev/null
+++ b/tests/cpp_api/cppstrided/Android.mk

@@ -0,0 +1,19 @@
+LOCAL_PATH:= $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE:= rstest-cppstrided
+LOCAL_LICENSE_KINDS:= SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS:= notice
+
+LOCAL_SDK_VERSION := 21
+LOCAL_NDK_STL_VARIANT := c++_static
+
+LOCAL_SRC_FILES:= \
+	multiply.rscript \
+	compute.cpp
+
+LOCAL_STATIC_LIBRARIES := \
+	libRScpp_static
+
+include frameworks/rs/tests/cpp_api/common.mk
+include $(BUILD_EXECUTABLE)

diff --git a/tests/cpp_api/latency/Android.bp b/tests/cpp_api/latency/Android.bp
deleted file mode 100644
index 3eaf1ee..0000000
--- a/tests/cpp_api/latency/Android.bp
+++ /dev/null

@@ -1,34 +0,0 @@
-//
-// Copyright (C) 2021 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: [
-        "Android-Apache-2.0",
-    ],
-}
-
-cc_binary {
-    name: "rstest-latency",
-    defaults: ["frameworks_rs_tests_cpp-api-defaults"],
-    sdk_version: "21",
-    stl: "c++_static",
-    srcs: [
-        "latency.rscript",
-        "latency.cpp",
-    ],
-    static_libs: ["libRScpp_static"],
-}

diff --git a/tests/cpp_api/latency/Android.mk b/tests/cpp_api/latency/Android.mk
new file mode 100644
index 0000000..16557f4
--- /dev/null
+++ b/tests/cpp_api/latency/Android.mk

@@ -0,0 +1,19 @@
+LOCAL_PATH:= $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE:= rstest-latency
+LOCAL_LICENSE_KINDS:= SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS:= notice
+
+LOCAL_SDK_VERSION := 21
+LOCAL_NDK_STL_VARIANT := c++_static
+
+LOCAL_SRC_FILES:= \
+	latency.rscript \
+	latency.cpp
+
+LOCAL_STATIC_LIBRARIES := \
+	libRScpp_static
+
+include frameworks/rs/tests/cpp_api/common.mk
+include $(BUILD_EXECUTABLE)

diff --git a/tests/cpp_api/typecheck/Android.bp b/tests/cpp_api/typecheck/Android.bp
deleted file mode 100644
index 793888c..0000000
--- a/tests/cpp_api/typecheck/Android.bp
+++ /dev/null

@@ -1,34 +0,0 @@
-//
-// Copyright (C) 2021 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: [
-        "Android-Apache-2.0",
-    ],
-}
-
-cc_binary {
-    name: "rstest-typecheck",
-    defaults: ["frameworks_rs_tests_cpp-api-defaults"],
-    sdk_version: "21",
-    stl: "c++_static",
-    srcs: [
-        "kernels.rscript",
-        "typecheck.cpp",
-    ],
-    static_libs: ["libRScpp_static"],
-}

diff --git a/tests/cpp_api/typecheck/Android.mk b/tests/cpp_api/typecheck/Android.mk
new file mode 100644
index 0000000..82f92ea
--- /dev/null
+++ b/tests/cpp_api/typecheck/Android.mk

@@ -0,0 +1,19 @@
+LOCAL_PATH:= $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE:= rstest-typecheck
+LOCAL_LICENSE_KINDS:= SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS:= notice
+
+LOCAL_SDK_VERSION := 21
+LOCAL_NDK_STL_VARIANT := c++_static
+
+LOCAL_SRC_FILES:= \
+	kernels.rscript \
+	typecheck.cpp
+
+LOCAL_STATIC_LIBRARIES := \
+	libRScpp_static
+
+include frameworks/rs/tests/cpp_api/common.mk
+include $(BUILD_EXECUTABLE)

diff --git a/tests/java_api/Balls/Android.bp b/tests/java_api/Balls/Android.bp
deleted file mode 100644
index e931d3e..0000000
--- a/tests/java_api/Balls/Android.bp
+++ /dev/null

@@ -1,63 +0,0 @@
-//
-// Copyright (C) 2008 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-// This variable is used to match the 'LOCAL_SDK_VERSION' field in the former Android.mk file.
-local_sdk_version = "14"
-// This variable is used to set the value of the '-target-api' option for the 'llvm-rs-cc' command.
-// Note: it may NOT always be the same as the 'local_sdk_version', due to the existing logic in the Make build system.
-// For the Android.mk to Android.bp conversion, it is recommend to run the build before and after
-// the conversion, to make sure the value of the '-target-api' option to be the same.
-target_api_level = local_sdk_version
-
-android_test {
-    name: "RsBalls",
-    srcs: [
-        "src/**/*.java",
-        ":RsBalls-rscript{RsBalls.srcjar}",
-    ],
-    resource_zips: [
-        ":RsBalls-rscript{RsBalls.res.zip}",
-    ],
-    sdk_version: local_sdk_version,
-}
-
-genrule {
-    name: "RsBalls-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "RsBalls.srcjar",
-        "RsBalls.res.zip",
-    ],
-    cmd: "$(location llvm-rs-cc) -target-api " + target_api_level +
-        "  -o $(genDir)/res/raw -p $(genDir)/src " +
-        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $(locations src/**/*.rscript) &&" +
-        "$(location soong_zip) -srcjar -o $(location RsBalls.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-        "$(location soong_zip) -o $(location RsBalls.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/Balls/Android.mk b/tests/java_api/Balls/Android.mk
new file mode 100644
index 0000000..409c735
--- /dev/null
+++ b/tests/java_api/Balls/Android.mk

@@ -0,0 +1,29 @@
+#
+# Copyright (C) 2008 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := RsBalls
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := 14
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/CannyLive/Android.bp b/tests/java_api/CannyLive/Android.bp
deleted file mode 100644
index 24d8a98..0000000
--- a/tests/java_api/CannyLive/Android.bp
+++ /dev/null

@@ -1,65 +0,0 @@
-//
-// Copyright (C) 2015 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-android_test {
-    name: "CannyLive",
-    static_libs: [
-        "androidx.legacy_legacy-support-v4",
-        "androidx.appcompat_appcompat",
-    ],
-    aaptflags: [
-        "--extra-packages",
-        "android.support.v7.appcompat",
-    ],
-    srcs: [
-        "src/**/*.java",
-        ":CannyLive-rscript{CannyLive.srcjar}",
-    ],
-    resource_zips: [
-        ":CannyLive-rscript{CannyLive.res.zip}",
-    ],
-    sdk_version: "current",
-}
-
-genrule {
-    name: "CannyLive-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "CannyLive.srcjar",
-        "CannyLive.res.zip",
-    ],
-    cmd: "for f in $(locations src/**/*.rscript); do " +
-        "  $(location llvm-rs-cc) -target-api 22 " +
-        "  -o $(genDir)/res/raw -p $(genDir)/src " +
-        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-        "done && " +
-        "$(location soong_zip) -srcjar -o $(location CannyLive.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-        "$(location soong_zip) -o $(location CannyLive.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/CannyLive/Android.mk b/tests/java_api/CannyLive/Android.mk
new file mode 100644
index 0000000..54aeb00
--- /dev/null
+++ b/tests/java_api/CannyLive/Android.mk

@@ -0,0 +1,34 @@
+#
+# Copyright (C) 2015 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+LOCAL_USE_AAPT2 :=true
+LOCAL_STATIC_ANDROID_LIBRARIES += androidx.legacy_legacy-support-v4
+LOCAL_STATIC_ANDROID_LIBRARIES += androidx.appcompat_appcompat
+LOCAL_AAPT_FLAGS += --extra-packages android.support.v7.appcompat
+LOCAL_RENDERSCRIPT_TARGET_API := 22
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
+LOCAL_SDK_VERSION := current
+
+LOCAL_PACKAGE_NAME := CannyLive
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/CannyLive/AndroidManifest.xml b/tests/java_api/CannyLive/AndroidManifest.xml
index 4764b8f..e7e3299 100644
--- a/tests/java_api/CannyLive/AndroidManifest.xml
+++ b/tests/java_api/CannyLive/AndroidManifest.xml

@@ -16,8 +16,7 @@
         android:theme="@style/AppTheme" >

         <activity

             android:name="com.android.example.cannylive.MainActivity"

-            android:label="@string/app_name"

-            android:exported="true" >

+            android:label="@string/app_name" >

             <intent-filter>

                 <action android:name="android.intent.action.MAIN" />

                 <category android:name="android.intent.category.LAUNCHER" />


diff --git a/tests/java_api/ComputeBenchmark/Android.bp b/tests/java_api/ComputeBenchmark/Android.bp
deleted file mode 100644
index 6a4e96d..0000000
--- a/tests/java_api/ComputeBenchmark/Android.bp
+++ /dev/null

@@ -1,57 +0,0 @@
-//
-// Copyright (C) 2012 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-android_test {
-    name: "RsComputeBenchmark",
-    srcs: [
-        "src/**/*.java",
-        ":RsComputeBenchmark-rscript{RsComputeBenchmark.srcjar}",
-    ],
-    resource_zips: [
-        ":RsComputeBenchmark-rscript{RsComputeBenchmark.res.zip}",
-    ],
-    sdk_version: "current",
-    min_sdk_version: "17",
-}
-
-genrule {
-    name: "RsComputeBenchmark-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "RsComputeBenchmark.srcjar",
-        "RsComputeBenchmark.res.zip",
-    ],
-    cmd: "for f in $(locations src/**/*.rscript); do " +
-        "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " +
-        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-        "done && " +
-        "$(location soong_zip) -srcjar -o $(location RsComputeBenchmark.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-        "$(location soong_zip) -o $(location RsComputeBenchmark.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/ComputeBenchmark/Android.mk b/tests/java_api/ComputeBenchmark/Android.mk
new file mode 100644
index 0000000..a3ca785
--- /dev/null
+++ b/tests/java_api/ComputeBenchmark/Android.mk

@@ -0,0 +1,31 @@
+#
+# Copyright (C) 2012 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) \
+                   $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := RsComputeBenchmark
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+LOCAL_MIN_SDK_VERSION := 17
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/ComputePerf/Android.bp b/tests/java_api/ComputePerf/Android.bp
deleted file mode 100644
index b60397d..0000000
--- a/tests/java_api/ComputePerf/Android.bp
+++ /dev/null

@@ -1,57 +0,0 @@
-//
-// Copyright (C) 2011 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-android_test {
-    name: "RsComputePerf",
-    srcs: [
-        "src/**/*.java",
-        ":RsComputePerf-rscript{RsComputePerf.srcjar}",
-    ],
-    resource_zips: [
-        ":RsComputePerf-rscript{RsComputePerf.res.zip}",
-    ],
-    sdk_version: "current",
-    min_sdk_version: "19",
-}
-
-genrule {
-    name: "RsComputePerf-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "RsComputePerf.srcjar",
-        "RsComputePerf.res.zip",
-    ],
-    cmd: "for f in $(locations src/**/*.rscript); do " +
-        "  $(location llvm-rs-cc)  -o $(genDir)/res/raw -p $(genDir)/src " +
-        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-        "done && " +
-        "$(location soong_zip) -srcjar -o $(location RsComputePerf.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-        "$(location soong_zip) -o $(location RsComputePerf.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/ComputePerf/Android.mk b/tests/java_api/ComputePerf/Android.mk
new file mode 100644
index 0000000..163e7dd
--- /dev/null
+++ b/tests/java_api/ComputePerf/Android.mk

@@ -0,0 +1,31 @@
+#
+# Copyright (C) 2011 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) \
+                   $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := RsComputePerf
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+LOCAL_MIN_SDK_VERSION := 19
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/GenImages/Android.bp b/tests/java_api/GenImages/Android.bp
deleted file mode 100644
index 520e698..0000000
--- a/tests/java_api/GenImages/Android.bp
+++ /dev/null

@@ -1,29 +0,0 @@
-//
-// Copyright (C) 2013 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-android_test {
-    name: "RsGenImages",
-    srcs: [
-        "src/**/*.java",
-    ],
-    sdk_version: "current",
-    min_sdk_version: "14",
-}

diff --git a/tests/java_api/GenImages/Android.mk b/tests/java_api/GenImages/Android.mk
new file mode 100644
index 0000000..d3e00ae
--- /dev/null
+++ b/tests/java_api/GenImages/Android.mk

@@ -0,0 +1,31 @@
+#
+# Copyright (C) 2013 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) \
+                   $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := RsGenImages
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+LOCAL_MIN_SDK_VERSION := 14
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/HealingBrush/Android.bp b/tests/java_api/HealingBrush/Android.bp
deleted file mode 100644
index b8843d5..0000000
--- a/tests/java_api/HealingBrush/Android.bp
+++ /dev/null

@@ -1,61 +0,0 @@
-//
-// Copyright (C) 2015 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: [
-        "Android-Apache-2.0",
-    ],
-}
-
-android_test {
-    name: "HealingBrush",
-    srcs: [
-        "src/**/*.java",
-        ":HealingBrush-rscript{HealingBrush.srcjar}",
-    ],
-    resource_zips: [
-        ":HealingBrush-rscript{HealingBrush.res.zip}",
-    ],
-    static_libs: ["android-support-v8-renderscript"],
-    sdk_version: "current",
-    jni_libs: ["librsjni"],
-}
-
-genrule {
-    name: "HealingBrush-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "HealingBrush.srcjar",
-        "HealingBrush.res.zip",
-    ],
-    cmd: "for f in $(locations src/**/*.rscript); do " +
-            "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " +
-            "  -target-api 21 -rs-package-name=androidx.renderscript " +
-            "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-            "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-            "done && " +
-            "$(location soong_zip) -srcjar -o $(location HealingBrush.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-            "$(location soong_zip) -o $(location HealingBrush.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/HealingBrush/Android.mk b/tests/java_api/HealingBrush/Android.mk
new file mode 100644
index 0000000..4ca1251
--- /dev/null
+++ b/tests/java_api/HealingBrush/Android.mk

@@ -0,0 +1,42 @@
+#
+# Copyright (C) 2015 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) \
+                   $(call all-renderscript-files-under, src)
+
+LOCAL_STATIC_JAVA_LIBRARIES := android-support-v8-renderscript
+
+LOCAL_PACKAGE_NAME := HealingBrush
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+LOCAL_RENDERSCRIPT_TARGET_API := 19
+LOCAL_RENDERSCRIPT_COMPATIBILITY := 18
+
+LOCAL_RENDERSCRIPT_CC := $(LLVM_RS_CC)
+LOCAL_RENDERSCRIPT_INCLUDES_OVERRIDE := \
+    $(TOPDIR)external/clang/lib/Headers \
+    $(TOPDIR)frameworks/rs/script_api/include
+
+LOCAL_RENDERSCRIPT_FLAGS := -rs-package-name=androidx.renderscript
+LOCAL_JNI_SHARED_LIBRARIES := librsjni
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/HelloComputeNDK/Android.bp b/tests/java_api/HelloComputeNDK/Android.bp
deleted file mode 100644
index 7dd25d6..0000000
--- a/tests/java_api/HelloComputeNDK/Android.bp
+++ /dev/null

@@ -1,35 +0,0 @@
-//
-// Copyright (C) 2013 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: [
-        "Android-Apache-2.0",
-    ],
-}
-
-android_test {
-    name: "HelloComputeNDK",
-
-    srcs: [
-        "src/**/*.java",
-    ],
-
-    sdk_version: "current",
-
-    jni_libs: ["libhellocomputendk"],
-
-}

diff --git a/tests/java_api/HelloComputeNDK/Android.mk b/tests/java_api/HelloComputeNDK/Android.mk
new file mode 100644
index 0000000..5fe2ffd
--- /dev/null
+++ b/tests/java_api/HelloComputeNDK/Android.mk

@@ -0,0 +1,35 @@
+#
+# Copyright (C) 2013 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) \
+                   $(call all-renderscript-files-under, src)
+
+LOCAL_CFLAGS := -Werror -Wall -Wextra
+
+LOCAL_PACKAGE_NAME := HelloComputeNDK
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+
+LOCAL_JNI_SHARED_LIBRARIES := libhellocomputendk
+
+include $(BUILD_PACKAGE)
+include $(LOCAL_PATH)/libhellocomputendk/Android.mk

diff --git a/tests/java_api/HelloComputeNDK/libhellocomputendk/Android.bp b/tests/java_api/HelloComputeNDK/libhellocomputendk/Android.bp
deleted file mode 100644
index 00679ff..0000000
--- a/tests/java_api/HelloComputeNDK/libhellocomputendk/Android.bp
+++ /dev/null

@@ -1,45 +0,0 @@
-// Copyright (C) 2013 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-//
-// This is the shared library included by the JNI test app.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-cc_test_library {
-   name: "libhellocomputendk",
-
-   srcs: [
-        "helloComputeNDK.cpp",
-        "mono.rscript",
-    ],
-    ldflags: ["-Wl,-Bsymbolic"],
-    header_libs: ["jni_headers"],
-    shared_libs: [
-        "libdl",
-        "liblog",
-        "libjnigraphics",
-    ],
-    static_libs: ["libRScpp_static"],
-
-    sdk_version: "current",
-
-    stl: "c++_static",
-
-}

diff --git a/tests/java_api/HelloComputeNDK/libhellocomputendk/Android.mk b/tests/java_api/HelloComputeNDK/libhellocomputendk/Android.mk
new file mode 100644
index 0000000..d0a8d88
--- /dev/null
+++ b/tests/java_api/HelloComputeNDK/libhellocomputendk/Android.mk

@@ -0,0 +1,38 @@
+# Copyright (C) 2013 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#
+# This is the shared library included by the JNI test app.
+#
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
+
+LOCAL_MODULE := libhellocomputendk
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_MODULE_TAGS := tests
+LOCAL_SRC_FILES := helloComputeNDK.cpp mono.rscript
+
+LOCAL_CFLAGS := -Wall -Werror
+LOCAL_LDFLAGS := -Wl,-Bsymbolic
+LOCAL_HEADER_LIBRARIES := jni_headers
+LOCAL_SHARED_LIBRARIES := libdl liblog libjnigraphics
+LOCAL_STATIC_LIBRARIES := libRScpp_static
+
+LOCAL_SDK_VERSION := current
+
+LOCAL_NDK_STL_VARIANT := c++_static
+
+include $(BUILD_SHARED_LIBRARY)

diff --git a/tests/java_api/ImageProcessing/Android.bp b/tests/java_api/ImageProcessing/Android.bp
deleted file mode 100644
index 6ac2167..0000000
--- a/tests/java_api/ImageProcessing/Android.bp
+++ /dev/null

@@ -1,62 +0,0 @@
-//
-// Copyright (C) 2009 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-android_test {
-    name: "ImageProcessing",
-    libs: [
-        "android.test.runner.stubs",
-        "android.test.base.stubs",
-    ],
-    static_libs: ["junit"],
-    srcs: [
-        "src/**/*.java",
-        ":ImageProcessing-rscript{ImageProcessing.srcjar}",
-    ],
-    resource_zips: [
-        ":ImageProcessing-rscript{ImageProcessing.res.zip}",
-    ],
-    sdk_version: "current",
-    min_sdk_version: "23",
-}
-
-genrule {
-    name: "ImageProcessing-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "ImageProcessing.srcjar",
-        "ImageProcessing.res.zip",
-    ],
-    cmd: "for f in $(locations src/**/*.rscript); do " +
-        "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " +
-        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-        "done && " +
-        "$(location soong_zip) -srcjar -o $(location ImageProcessing.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-        "$(location soong_zip) -o $(location ImageProcessing.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/ImageProcessing/Android.mk b/tests/java_api/ImageProcessing/Android.mk
new file mode 100644
index 0000000..5ac0d15
--- /dev/null
+++ b/tests/java_api/ImageProcessing/Android.mk

@@ -0,0 +1,35 @@
+#
+# Copyright (C) 2009 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_JAVA_LIBRARIES := android.test.runner.stubs android.test.base.stubs
+
+LOCAL_STATIC_JAVA_LIBRARIES := junit
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) \
+                   $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := ImageProcessing
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+LOCAL_MIN_SDK_VERSION := 23
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/ImageProcessing2/Android.bp b/tests/java_api/ImageProcessing2/Android.bp
deleted file mode 100644
index 85c3a14..0000000
--- a/tests/java_api/ImageProcessing2/Android.bp
+++ /dev/null

@@ -1,61 +0,0 @@
-//
-// Copyright (C) 2009 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: [
-        "Android-Apache-2.0",
-    ],
-}
-
-android_test {
-    name: "ImageProcessing2",
-    srcs: [
-        "src/**/*.java",
-        ":ImageProcessing2-rscript{ImageProcessing2.srcjar}",
-    ],
-    resource_zips: [
-        ":ImageProcessing2-rscript{ImageProcessing2.res.zip}",
-    ],
-    static_libs: ["android-support-v8-renderscript"],
-    sdk_version: "current",
-    jni_libs: ["librsjni"],
-}
-
-genrule {
-    name: "ImageProcessing2-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "ImageProcessing2.srcjar",
-        "ImageProcessing2.res.zip",
-    ],
-    cmd: "for f in $(locations src/**/*.rscript); do " +
-            "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " +
-            "  -target-api 21 -rs-package-name=androidx.renderscript " +
-            "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-            "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-            "done && " +
-            "$(location soong_zip) -srcjar -o $(location ImageProcessing2.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-            "$(location soong_zip) -o $(location ImageProcessing2.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/ImageProcessing2/Android.mk b/tests/java_api/ImageProcessing2/Android.mk
new file mode 100644
index 0000000..7165cc3
--- /dev/null
+++ b/tests/java_api/ImageProcessing2/Android.mk

@@ -0,0 +1,39 @@
+#
+# Copyright (C) 2009 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) \
+                   $(call all-renderscript-files-under, src)
+
+LOCAL_STATIC_JAVA_LIBRARIES := android-support-v8-renderscript
+
+LOCAL_PACKAGE_NAME := ImageProcessing2
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+LOCAL_RENDERSCRIPT_TARGET_API := 18
+LOCAL_RENDERSCRIPT_COMPATIBILITY := 18
+LOCAL_RENDERSCRIPT_INCLUDES_OVERRIDE := $(TOPDIR)external/clang/lib/Headers \
+                                        $(TOPDIR)frameworks/rs/script_api/include
+
+LOCAL_RENDERSCRIPT_FLAGS := -rs-package-name=androidx.renderscript
+LOCAL_JNI_SHARED_LIBRARIES := librsjni
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/ImageProcessing_jb/Android.bp b/tests/java_api/ImageProcessing_jb/Android.bp
deleted file mode 100644
index 9ffa7ed..0000000
--- a/tests/java_api/ImageProcessing_jb/Android.bp
+++ /dev/null

@@ -1,62 +0,0 @@
-//
-// Copyright (C) 2009 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-android_test {
-    name: "ImageProcessingJB",
-    static_libs: ["androidx.test.rules"],
-    libs: [
-        "android.test.runner.stubs",
-        "android.test.base.stubs",
-    ],
-    test_suites: ["device-tests"],
-    srcs: [
-        "src/**/*.java",
-        ":ImageProcessingJB-rscript{ImageProcessingJB.srcjar}",
-    ],
-    resource_zips: [
-        ":ImageProcessingJB-rscript{ImageProcessingJB.res.zip}",
-    ],
-    sdk_version: "current",
-}
-
-genrule {
-    name: "ImageProcessingJB-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "ImageProcessingJB.srcjar",
-        "ImageProcessingJB.res.zip",
-    ],
-    cmd: "for f in $(locations src/**/*.rscript); do " +
-        "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " +
-        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-        "done && " +
-        "$(location soong_zip) -srcjar -o $(location ImageProcessingJB.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-        "$(location soong_zip) -o $(location ImageProcessingJB.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/ImageProcessing_jb/Android.mk b/tests/java_api/ImageProcessing_jb/Android.mk
new file mode 100644
index 0000000..71669e7
--- /dev/null
+++ b/tests/java_api/ImageProcessing_jb/Android.mk

@@ -0,0 +1,34 @@
+#
+# Copyright (C) 2009 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_STATIC_JAVA_LIBRARIES := androidx.test.rules
+LOCAL_JAVA_LIBRARIES := android.test.runner.stubs android.test.base.stubs
+
+LOCAL_MODULE_TAGS := tests
+LOCAL_COMPATIBILITY_SUITE += device-tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) \
+                   $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := ImageProcessingJB
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/LatencyBenchmark/Android.bp b/tests/java_api/LatencyBenchmark/Android.bp
deleted file mode 100644
index 0f05493..0000000
--- a/tests/java_api/LatencyBenchmark/Android.bp
+++ /dev/null

@@ -1,58 +0,0 @@
-//
-// Copyright (C) 2012 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-android_test {
-    name: "RsLatencyBenchmark",
-    srcs: [
-        "src/**/*.java",
-        ":RsLatencyBenchmark-rscript{RsLatencyBenchmark.srcjar}",
-    ],
-    resource_zips: [
-        ":RsLatencyBenchmark-rscript{RsLatencyBenchmark.res.zip}"
-    ],
-    sdk_version: "current",
-    min_sdk_version: "23",
-}
-
-genrule {
-    name: "RsLatencyBenchmark-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "RsLatencyBenchmark.srcjar",
-        "RsLatencyBenchmark.res.zip",
-    ],
-    cmd: "for f in $(locations src/**/*.rscript); do " +
-        "  $(location llvm-rs-cc) -target-api 23 " +
-        "  -o $(genDir)/res/raw -p $(genDir)/src " +
-        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-        "done && " +
-        "$(location soong_zip) -srcjar -o $(location RsLatencyBenchmark.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-        "$(location soong_zip) -o $(location RsLatencyBenchmark.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/LatencyBenchmark/Android.mk b/tests/java_api/LatencyBenchmark/Android.mk
new file mode 100644
index 0000000..8df2d05
--- /dev/null
+++ b/tests/java_api/LatencyBenchmark/Android.mk

@@ -0,0 +1,33 @@
+#
+# Copyright (C) 2012 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) \
+                   $(call all-renderscript-files-under, src)
+
+LOCAL_RENDERSCRIPT_TARGET_API := 23
+
+LOCAL_PACKAGE_NAME := RsLatencyBenchmark
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+LOCAL_MIN_SDK_VERSION := 23
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/LivePreview/Android.bp b/tests/java_api/LivePreview/Android.bp
deleted file mode 100644
index 0b06972..0000000
--- a/tests/java_api/LivePreview/Android.bp
+++ /dev/null

@@ -1,56 +0,0 @@
-//
-// Copyright (C) 2012 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-android_test {
-    name: "PreviewRS",
-    srcs: [
-        "src/**/*.java",
-        ":PreviewRS-rscript{PreviewRS.srcjar}",
-    ],
-    resource_zips: [
-        ":PreviewRS-rscript{PreviewRS.res.zip}",
-    ],
-    sdk_version: "current",
-}
-
-genrule {
-    name: "PreviewRS-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "PreviewRS.srcjar",
-        "PreviewRS.res.zip",
-    ],
-    cmd: "for f in $(locations src/**/*.rscript); do " +
-        "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " +
-        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-        "done && " +
-        "$(location soong_zip) -srcjar -o $(location PreviewRS.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-        "$(location soong_zip) -o $(location PreviewRS.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/LivePreview/Android.mk b/tests/java_api/LivePreview/Android.mk
new file mode 100644
index 0000000..a1cbe5f
--- /dev/null
+++ b/tests/java_api/LivePreview/Android.mk

@@ -0,0 +1,29 @@
+#
+# Copyright (C) 2012 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := PreviewRS
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/MathErr/Android.bp b/tests/java_api/MathErr/Android.bp
deleted file mode 100644
index 628c5c5..0000000
--- a/tests/java_api/MathErr/Android.bp
+++ /dev/null

@@ -1,57 +0,0 @@
-//
-// Copyright (C) 2013 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-android_test {
-    name: "RsMathErr",
-    srcs: [
-        "src/**/*.java",
-        ":RsMathErr-rscript{RsMathErr.srcjar}",
-    ],
-    resource_zips: [
-        ":RsMathErr-rscript{RsMathErr.res.zip}"
-    ],
-    sdk_version: "current",
-    min_sdk_version: "19",
-}
-
-genrule {
-    name: "RsMathErr-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "RsMathErr.srcjar",
-        "RsMathErr.res.zip",
-    ],
-    cmd: "for f in $(locations src/**/*.rscript); do " +
-        "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " +
-        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-        "done && " +
-        "$(location soong_zip) -srcjar -o $(location RsMathErr.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-        "$(location soong_zip) -o $(location RsMathErr.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/MathErr/Android.mk b/tests/java_api/MathErr/Android.mk
new file mode 100644
index 0000000..1fae614
--- /dev/null
+++ b/tests/java_api/MathErr/Android.mk

@@ -0,0 +1,31 @@
+#
+# Copyright (C) 2013 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) \
+                   $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := RsMathErr
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+LOCAL_MIN_SDK_VERSION := 19
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/RSTestBackward/Android.bp b/tests/java_api/RSTestBackward/Android.bp
deleted file mode 100644
index c962be7..0000000
--- a/tests/java_api/RSTestBackward/Android.bp
+++ /dev/null

@@ -1,38 +0,0 @@
-//
-// Copyright (C) 2017 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: [
-        "Android-Apache-2.0",
-    ],
-}
-
-android_test {
-    name: "RSTestBackward",
-    sdk_version: "current",
-    srcs: [
-        "src/**/*.java",
-        ":RSUnitTests_java_srcs",
-        ":RSTest-rscript{RSTest.srcjar}",
-    ],
-    resource_zips: [
-        ":RSTest-rscript{RSTest.res.zip}",
-    ],
-    static_libs: ["androidx.test.rules"],
-    test_suites: ["device-tests"],
-    min_sdk_version: "21",
-}

diff --git a/tests/java_api/RSTestBackward/Android.mk b/tests/java_api/RSTestBackward/Android.mk
new file mode 100644
index 0000000..b3111a6
--- /dev/null
+++ b/tests/java_api/RSTestBackward/Android.mk

@@ -0,0 +1,38 @@
+#
+# Copyright (C) 2017 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_PACKAGE_NAME := RSTestBackward
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := \
+    $(call all-java-files-under, src)\
+    $(call all-java-files-under, ../RSUnitTests/src)\
+    $(call all-renderscript-files-under, ../RSUnitTests/src)\
+
+LOCAL_STATIC_JAVA_LIBRARIES := androidx.test.rules
+LOCAL_COMPATIBILITY_SUITE := device-tests
+
+LOCAL_RENDERSCRIPT_TARGET_API := current
+LOCAL_MIN_SDK_VERSION := 21
+LOCAL_SDK_VERSION := current
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/RSTest_CompatLib/Android.bp b/tests/java_api/RSTest_CompatLib/Android.bp
deleted file mode 100644
index 74dc966..0000000
--- a/tests/java_api/RSTest_CompatLib/Android.bp
+++ /dev/null

@@ -1,66 +0,0 @@
-//
-// Copyright (C) 2017 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: [
-        "Android-Apache-2.0",
-    ],
-}
-
-android_test {
-    name: "RSTest_Compat",
-    sdk_version: "current",
-    srcs: [
-        "src/**/*.java",
-        ":RSUnitTests_java_gui",
-        ":RSUnitTests_java_supportlibsrc_gen",
-        ":RSTest_Compat-rscript{RSTest_Compat.srcjar}",
-    ],
-    resource_zips: [
-        ":RSTest_Compat-rscript{RSTest_Compat.res.zip}",
-    ],
-    static_libs: [
-        "androidx.test.rules",
-        "android-support-v8-renderscript",
-    ],
-    min_sdk_version: "8",
-}
-
-genrule {
-    name: "RSTest_Compat-rscript",
-    srcs: [
-        ":RSUnitTests_rscript_supportlibsrc_gen",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "RSTest_Compat.srcjar",
-        "RSTest_Compat.res.zip",
-    ],
-    cmd: "for f in $(locations :RSUnitTests_rscript_supportlibsrc_gen); do " +
-        "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " +
-        "  -rs-package-name=androidx.renderscript " +
-        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-        "done && " +
-        "$(location soong_zip) -srcjar -o $(location RSTest_Compat.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-        "$(location soong_zip) -o $(location RSTest_Compat.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/RSTest_CompatLib/Android.mk b/tests/java_api/RSTest_CompatLib/Android.mk
new file mode 100644
index 0000000..2df8904
--- /dev/null
+++ b/tests/java_api/RSTest_CompatLib/Android.mk

@@ -0,0 +1,43 @@
+#
+# Copyright (C) 2017 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_PACKAGE_NAME := RSTest_Compat
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := \
+    $(call all-java-files-under,src) \
+    $(call all-java-files-under,../RSUnitTests/gui) \
+    $(call all-java-files-under,../RSUnitTests/supportlibsrc_gen) \
+    $(call all-renderscript-files-under,../RSUnitTests/supportlibsrc_gen) \
+
+LOCAL_STATIC_JAVA_LIBRARIES := \
+    androidx.test.rules \
+    android-support-v8-renderscript \
+
+LOCAL_RENDERSCRIPT_TARGET_API := current
+LOCAL_RENDERSCRIPT_COMPATIBILITY := true
+LOCAL_SDK_VERSION := current
+LOCAL_MIN_SDK_VERSION := 8
+
+LOCAL_RENDERSCRIPT_FLAGS := -rs-package-name=androidx.renderscript
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/RSUnitTests/Android.bp b/tests/java_api/RSUnitTests/Android.bp
deleted file mode 100644
index 7d35d9f..0000000
--- a/tests/java_api/RSUnitTests/Android.bp
+++ /dev/null

@@ -1,55 +0,0 @@
-//
-// Copyright (C) 2008 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-filegroup {
-    name: "RSUnitTests_java_srcs",
-    srcs: [
-        "src/**/*.java",
-    ],
-}
-
-filegroup {
-    name: "RSUnitTests_rscript_srcs",
-    srcs: [
-        "src/**/*.rscript",
-    ],
-}
-
-filegroup {
-    name: "RSUnitTests_java_gui",
-    srcs: [
-        "gui/**/*.java",
-    ],
-}
-
-filegroup {
-    name: "RSUnitTests_java_supportlibsrc_gen",
-    srcs: [
-        "supportlibsrc_gen/**/*.java",
-    ],
-}
-
-filegroup {
-    name: "RSUnitTests_rscript_supportlibsrc_gen",
-    srcs: [
-        "supportlibsrc_gen/**/*.rscript",
-    ],
-}

diff --git a/tests/java_api/Refocus/Android.bp b/tests/java_api/Refocus/Android.bp
deleted file mode 100644
index 26af7aa..0000000
--- a/tests/java_api/Refocus/Android.bp
+++ /dev/null

@@ -1,64 +0,0 @@
-//
-// Copyright (C) 2015 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: [
-        "Android-Apache-2.0",
-    ],
-}
-
-android_test {
-    name: "Refocus",
-    static_libs: [
-        "android-support-v8-renderscript",
-        "xmp_toolkit",
-    ],
-    srcs: [
-        "src/**/*.java",
-        ":Refocus-rscript{Refocus.srcjar}",
-    ],
-    resource_zips: [
-        ":Refocus-rscript{Refocus.res.zip}",
-    ],
-    sdk_version: "current",
-    jni_libs: ["librsjni"],
-}
-
-genrule {
-    name: "Refocus-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "Refocus.srcjar",
-        "Refocus.res.zip",
-    ],
-    cmd: "for f in $(locations src/**/*.rscript); do " +
-            "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " +
-            "  -target-api 21 -rs-package-name=androidx.renderscript " +
-            "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-            "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-            "done && " +
-            "$(location soong_zip) -srcjar -o $(location Refocus.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-            "$(location soong_zip) -o $(location Refocus.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/Refocus/Android.mk b/tests/java_api/Refocus/Android.mk
new file mode 100644
index 0000000..9635005
--- /dev/null
+++ b/tests/java_api/Refocus/Android.mk

@@ -0,0 +1,43 @@
+#
+# Copyright (C) 2015 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_STATIC_JAVA_LIBRARIES := android-support-v8-renderscript
+LOCAL_STATIC_JAVA_LIBRARIES += xmp_toolkit
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := Refocus
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+
+LOCAL_SDK_VERSION := current
+LOCAL_RENDERSCRIPT_TARGET_API := 21
+LOCAL_RENDERSCRIPT_COMPATIBILITY := 21
+
+LOCAL_RENDERSCRIPT_CC := $(LLVM_RS_CC)
+LOCAL_RENDERSCRIPT_INCLUDES_OVERRIDE := \
+    $(TOPDIR)external/clang/lib/Headers \
+    $(TOPDIR)frameworks/rs/script_api/include
+
+LOCAL_RENDERSCRIPT_FLAGS := -rs-package-name=androidx.renderscript
+LOCAL_JNI_SHARED_LIBRARIES := librsjni
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/RsCameraDemo/Android.bp b/tests/java_api/RsCameraDemo/Android.bp
deleted file mode 100644
index 9e03305..0000000
--- a/tests/java_api/RsCameraDemo/Android.bp
+++ /dev/null

@@ -1,56 +0,0 @@
-//
-// Copyright (C) 2015 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-android_test {
-    name: "RsCameraDemo",
-    srcs: [
-        "src/**/*.java",
-        ":RsCameraDemo-rscript{RsCameraDemo.srcjar}",
-    ],
-    resource_zips: [
-        ":RsCameraDemo-rscript{RsCameraDemo.srcjar}",
-    ],
-    sdk_version: "current",
-}
-
-genrule {
-    name: "RsCameraDemo-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "RsCameraDemo.srcjar",
-        "RsCameraDemo.res.zip",
-    ],
-    cmd: "for f in $(locations src/**/*.rscript); do " +
-        "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " +
-        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-        "done && " +
-        "$(location soong_zip) -srcjar -o $(location RsCameraDemo.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-        "$(location soong_zip) -o $(location RsCameraDemo.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/RsCameraDemo/Android.mk b/tests/java_api/RsCameraDemo/Android.mk
new file mode 100644
index 0000000..fd9daf9
--- /dev/null
+++ b/tests/java_api/RsCameraDemo/Android.mk

@@ -0,0 +1,29 @@
+#
+# Copyright (C) 2015 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
+LOCAL_SDK_VERSION := current
+
+LOCAL_PACKAGE_NAME := RsCameraDemo
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/RsMinimalTest/Android.bp b/tests/java_api/RsMinimalTest/Android.bp
deleted file mode 100644
index 96bc7bb..0000000
--- a/tests/java_api/RsMinimalTest/Android.bp
+++ /dev/null

@@ -1,28 +0,0 @@
-//
-// Copyright (C) 2018 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-android_test {
-    name: "RsMinimalTest",
-    static_libs: ["androidx.test.rules"],
-    srcs: ["src/**/*.java"],
-    sdk_version: "30",
-    test_suites: ["device-tests"],
-}

diff --git a/tests/java_api/RsMinimalTest/Android.mk b/tests/java_api/RsMinimalTest/Android.mk
new file mode 100644
index 0000000..65cba1b
--- /dev/null
+++ b/tests/java_api/RsMinimalTest/Android.mk

@@ -0,0 +1,33 @@
+#
+# Copyright (C) 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+LOCAL_STATIC_JAVA_LIBRARIES := androidx.test.rules
+
+LOCAL_SRC_FILES := $(call all-java-files-under,src)
+
+LOCAL_RENDERSCRIPT_TARGET_API := 21
+LOCAL_SDK_VERSION := 26
+
+LOCAL_PACKAGE_NAME := RsMinimalTest
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_COMPATIBILITY_SUITE := device-tests
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/RsMinimalTest/AndroidManifest.xml b/tests/java_api/RsMinimalTest/AndroidManifest.xml
index 40b5acf..006d320 100644
--- a/tests/java_api/RsMinimalTest/AndroidManifest.xml
+++ b/tests/java_api/RsMinimalTest/AndroidManifest.xml

@@ -17,7 +17,7 @@
     package="com.android.rs.minimaltest">
     <uses-sdk
         android:minSdkVersion="21"
-        android:targetSdkVersion="30" />
+        android:targetSdkVersion="26" />
 
     <application
         android:largeHeap="true"

diff --git a/tests/java_api/RsNbody/Android.bp b/tests/java_api/RsNbody/Android.bp
deleted file mode 100644
index 0118315..0000000
--- a/tests/java_api/RsNbody/Android.bp
+++ /dev/null

@@ -1,58 +0,0 @@
-//
-// Copyright (C) 2015 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-android_test {
-    name: "RsNbody",
-    srcs: [
-        "src/**/*.java",
-        ":RsNbody-rscript{RsNbody.srcjar}",
-    ],
-    resource_zips: [
-        ":RsNbody-rscript{RsNbody.res.zip}",
-    ],
-    static_libs: ["androidx.legacy_legacy-support-v4"],
-    sdk_version: "current",
-}
-
-genrule {
-    name: "RsNbody-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "RsNbody.srcjar",
-        "RsNbody.res.zip",
-    ],
-    cmd: "for f in $(locations src/**/*.rscript); do " +
-        "  $(location llvm-rs-cc) -target-api 23 " +
-        "  -o $(genDir)/res/raw -p $(genDir)/src " +
-        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-        "done && " +
-        "$(location soong_zip) -srcjar -o $(location RsNbody.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-        "$(location soong_zip) -o $(location RsNbody.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/RsNbody/Android.mk b/tests/java_api/RsNbody/Android.mk
new file mode 100644
index 0000000..4c99c42
--- /dev/null
+++ b/tests/java_api/RsNbody/Android.mk

@@ -0,0 +1,38 @@
+#
+# Copyright (C) 2015 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+LOCAL_USE_AAPT2 := true
+LOCAL_SRC_FILES := $(call all-java-files-under, src) \
+                   $(call all-renderscript-files-under, src)
+
+LOCAL_STATIC_ANDROID_LIBRARIES += androidx.legacy_legacy-support-v4
+
+LOCAL_PACKAGE_NAME := RsNbody
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+LOCAL_RENDERSCRIPT_TARGET_API := 23
+
+LOCAL_RENDERSCRIPT_CC := $(LLVM_RS_CC)
+LOCAL_RENDERSCRIPT_INCLUDES_OVERRIDE := \
+    $(TOPDIR)external/clang/lib/Headers \
+    $(TOPDIR)frameworks/rs/script_api/include
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/RsTest/Android.bp b/tests/java_api/RsTest/Android.bp
deleted file mode 100644
index 3695234..0000000
--- a/tests/java_api/RsTest/Android.bp
+++ /dev/null

@@ -1,62 +0,0 @@
-//
-// Copyright (C) 2008 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: [
-        "Android-Apache-2.0",
-    ],
-}
-
-android_test {
-    name: "RSTest",
-    static_libs: ["androidx.test.rules"],
-    srcs: [
-        "src/**/*.java",
-        ":RSUnitTests_java_srcs",
-        ":RSUnitTests_java_gui",
-        ":RSTest-rscript{RSTest.srcjar}",
-    ],
-    resource_zips: [
-        ":RSTest-rscript{RSTest.res.zip}",
-    ],
-    sdk_version: "current",
-    test_suites: ["device-tests"],
-}
-
-genrule {
-    name: "RSTest-rscript",
-    srcs: [
-        ":RSUnitTests_rscript_srcs",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "RSTest.srcjar",
-        "RSTest.res.zip",
-    ],
-    cmd: "for f in $(locations :RSUnitTests_rscript_srcs); do " +
-        "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " +
-        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-        "done && " +
-        "$(location soong_zip) -srcjar -o $(location RSTest.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-        "$(location soong_zip) -o $(location RSTest.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/RsTest/Android.mk b/tests/java_api/RsTest/Android.mk
new file mode 100644
index 0000000..4f231b5
--- /dev/null
+++ b/tests/java_api/RsTest/Android.mk

@@ -0,0 +1,37 @@
+#
+# Copyright (C) 2008 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+LOCAL_STATIC_JAVA_LIBRARIES := androidx.test.rules
+
+LOCAL_SRC_FILES := \
+    $(call all-java-files-under,src) \
+    $(call all-java-files-under,../RSUnitTests/gui) \
+    $(call all-java-files-under,../RSUnitTests/src) \
+    $(call all-renderscript-files-under,../RSUnitTests/src) \
+
+LOCAL_RENDERSCRIPT_TARGET_API := current
+
+LOCAL_PACKAGE_NAME := RSTest
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+LOCAL_COMPATIBILITY_SUITE := device-tests
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/RsTest_11/Android.bp b/tests/java_api/RsTest_11/Android.bp
deleted file mode 100644
index 1e176d2..0000000
--- a/tests/java_api/RsTest_11/Android.bp
+++ /dev/null

@@ -1,65 +0,0 @@
-//
-// Copyright (C) 2008 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-// This variable is used to match the 'LOCAL_SDK_VERSION' field in the former Android.mk file.
-local_sdk_version = "11"
-// This variable is used to set the value of the '-target-api' option for the 'llvm-rs-cc' command.
-// Note: it may NOT always be the same as the 'local_sdk_version', due to the existing logic in the Make build system.
-// For the Android.mk to Android.bp conversion, it is recommend to run the build before and after
-// the conversion, to make sure the value of the '-target-api' option to be the same.
-target_api_level = local_sdk_version
-
-android_test {
-    name: "RSTest_v11",
-    srcs: [
-        "src/**/*.java",
-        ":RSTest_v11-rscript{RSTest_v11.srcjar}",
-    ],
-    resource_zips: [
-        ":RSTest_v11-rscript{RSTest_v11.res.zip}"
-    ],
-    sdk_version: local_sdk_version,
-}
-
-genrule {
-    name: "RSTest_v11-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "RSTest_v11.srcjar",
-        "RSTest_v11.res.zip",
-    ],
-    cmd: "for f in $(locations src/**/*.rscript); do " +
-        "  $(location llvm-rs-cc) -target-api " + target_api_level +
-        "  -o $(genDir)/res/raw -p $(genDir)/src " +
-        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-        "done && " +
-        "$(location soong_zip) -srcjar -o $(location RSTest_v11.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-        "$(location soong_zip) -o $(location RSTest_v11.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/RsTest_11/Android.mk b/tests/java_api/RsTest_11/Android.mk
new file mode 100644
index 0000000..8741441
--- /dev/null
+++ b/tests/java_api/RsTest_11/Android.mk

@@ -0,0 +1,29 @@
+#
+# Copyright (C) 2008 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := RSTest_v11
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := 11
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/RsTest_14/Android.bp b/tests/java_api/RsTest_14/Android.bp
deleted file mode 100644
index 0f52649..0000000
--- a/tests/java_api/RsTest_14/Android.bp
+++ /dev/null

@@ -1,65 +0,0 @@
-//
-// Copyright (C) 2008 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-// This variable is used to match the 'LOCAL_SDK_VERSION' field in the former Android.mk file.
-local_sdk_version = "14"
-// This variable is used to set the value of the '-target-api' option for the 'llvm-rs-cc' command.
-// Note: it may NOT always be the same as the 'local_sdk_version', due to the existing logic in the Make build system.
-// For the Android.mk to Android.bp conversion, it is recommend to run the build before and after
-// the conversion, to make sure the value of the '-target-api' option to be the same.
-target_api_level = local_sdk_version
-
-android_test {
-    name: "RSTest_v14",
-    srcs: [
-        "src/**/*.java",
-        ":RSTest_v14-rscript{RSTest_v14.srcjar}",
-    ],
-    resource_zips: [
-        ":RSTest_v14-rscript{RSTest_v14.res.zip}",
-    ],
-    sdk_version: local_sdk_version,
-}
-
-genrule {
-    name: "RSTest_v14-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "RSTest_v14.srcjar",
-        "RSTest_v14.res.zip",
-    ],
-    cmd: "for f in $(locations src/**/*.rscript); do " +
-        "  $(location llvm-rs-cc) -target-api " + target_api_level +
-        "  -o $(genDir)/res/raw -p $(genDir)/src " +
-        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-        "done && " +
-        "$(location soong_zip) -srcjar -o $(location RSTest_v14.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-        "$(location soong_zip) -o $(location RSTest_v14.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/RsTest_14/Android.mk b/tests/java_api/RsTest_14/Android.mk
new file mode 100644
index 0000000..3008e2e
--- /dev/null
+++ b/tests/java_api/RsTest_14/Android.mk

@@ -0,0 +1,29 @@
+#
+# Copyright (C) 2008 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := RSTest_v14
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := 14
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/RsTest_16/Android.bp b/tests/java_api/RsTest_16/Android.bp
deleted file mode 100644
index 50fce59..0000000
--- a/tests/java_api/RsTest_16/Android.bp
+++ /dev/null

@@ -1,65 +0,0 @@
-//
-// Copyright (C) 2008 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-// This variable is used to match the 'LOCAL_SDK_VERSION' field in the former Android.mk file.
-local_sdk_version = "16"
-// This variable is used to set the value of the '-target-api' option for the 'llvm-rs-cc' command.
-// Note: it may NOT always be the same as the 'local_sdk_version', due to the existing logic in the Make build system.
-// For the Android.mk to Android.bp conversion, it is recommend to run the build before and after
-// the conversion, to make sure the value of the '-target-api' option to be the same.
-target_api_level = local_sdk_version
-
-android_test {
-    name: "RSTest_v16",
-    srcs: [
-        "src/**/*.java",
-        ":RSTest_v16-rscript{RSTest_v16.srcjar}",
-    ],
-    resource_zips: [
-        ":RSTest_v16-rscript{RSTest_v16.res.zip}",
-    ],
-    sdk_version: local_sdk_version,
-}
-
-genrule {
-    name: "RSTest_v16-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "RSTest_v16.srcjar",
-        "RSTest_v16.res.zip",
-    ],
-    cmd: "for f in $(locations src/**/*.rscript); do " +
-        "  $(location llvm-rs-cc) -target-api " + target_api_level +
-        "  -o $(genDir)/res/raw -p $(genDir)/src " +
-        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-        "done && " +
-        "$(location soong_zip) -srcjar -o $(location RSTest_v16.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-        "$(location soong_zip) -o $(location RSTest_v16.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/RsTest_16/Android.mk b/tests/java_api/RsTest_16/Android.mk
new file mode 100644
index 0000000..4bb59a2
--- /dev/null
+++ b/tests/java_api/RsTest_16/Android.mk

@@ -0,0 +1,29 @@
+#
+# Copyright (C) 2008 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := RSTest_v16
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := 16
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/SSHealingBrush/Android.bp b/tests/java_api/SSHealingBrush/Android.bp
deleted file mode 100644
index 4e139e4..0000000
--- a/tests/java_api/SSHealingBrush/Android.bp
+++ /dev/null

@@ -1,58 +0,0 @@
-//
-// Copyright (C) 2015 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-android_test {
-    name: "SSHealingBrush",
-    srcs: [
-        "src/**/*.java",
-        ":SSHealingBrush-rscript{SSHealingBrush.srcjar}",
-    ],
-    resource_zips: [
-        ":SSHealingBrush-rscript{SSHealingBrush.res.zip}",
-    ],
-    static_libs: ["android-support-v8-renderscript"],
-    sdk_version: "current",
-}
-
-genrule {
-    name: "SSHealingBrush-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "SSHealingBrush.srcjar",
-        "SSHealingBrush.res.zip",
-    ],
-    cmd: "for f in $(locations src/**/*.rscript); do " +
-        "  $(location llvm-rs-cc) -target-api 0" +
-        "  -o $(genDir)/res/raw -p $(genDir)/src " +
-        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-        "done && " +
-        "$(location soong_zip) -srcjar -o $(location SSHealingBrush.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-        "$(location soong_zip) -o $(location SSHealingBrush.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/SSHealingBrush/Android.mk b/tests/java_api/SSHealingBrush/Android.mk
new file mode 100644
index 0000000..63154e5
--- /dev/null
+++ b/tests/java_api/SSHealingBrush/Android.mk

@@ -0,0 +1,34 @@
+#
+# Copyright (C) 2015 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) \
+                   $(call all-renderscript-files-under, src)
+
+LOCAL_STATIC_JAVA_LIBRARIES := android-support-v8-renderscript
+
+LOCAL_PACKAGE_NAME := SSHealingBrush
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+
+LOCAL_RENDERSCRIPT_FLAGS := -target-api 0
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/SampleTest/Android.bp b/tests/java_api/SampleTest/Android.bp
deleted file mode 100644
index 0399177..0000000
--- a/tests/java_api/SampleTest/Android.bp
+++ /dev/null

@@ -1,57 +0,0 @@
-//
-// Copyright (C) 2012 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-android_test {
-    name: "SampleRS",
-    srcs: [
-        "src/**/*.java",
-        ":SampleRS-rscript{SampleRS.srcjar}",
-    ],
-    resource_zips: [
-        ":SampleRS-rscript{SampleRS.res.zip}",
-    ],
-    sdk_version: "current",
-    min_sdk_version: "17",
-}
-
-genrule {
-    name: "SampleRS-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "SampleRS.srcjar",
-        "SampleRS.res.zip",
-    ],
-    cmd: "for f in $(locations src/**/*.rscript); do " +
-        "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " +
-        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-        "done && " +
-        "$(location soong_zip) -srcjar -o $(location SampleRS.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-        "$(location soong_zip) -o $(location SampleRS.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/SampleTest/Android.mk b/tests/java_api/SampleTest/Android.mk
new file mode 100644
index 0000000..8152181
--- /dev/null
+++ b/tests/java_api/SampleTest/Android.mk

@@ -0,0 +1,30 @@
+#
+# Copyright (C) 2012 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := SampleRS
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+LOCAL_MIN_SDK_VERSION := 17
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/ScriptGroupTest/Android.bp b/tests/java_api/ScriptGroupTest/Android.bp
deleted file mode 100644
index 2f45df1..0000000
--- a/tests/java_api/ScriptGroupTest/Android.bp
+++ /dev/null

@@ -1,58 +0,0 @@
-//
-// Copyright (C) 2009 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-android_test {
-    name: "ScriptGroupTest",
-    libs: ["android.test.runner.stubs"],
-    srcs: [
-        "src/**/*.java",
-        ":ScriptGroupTest-rscript{ScriptGroupTest.srcjar}",
-    ],
-    resource_zips: [
-        ":ScriptGroupTest-rscript{ScriptGroupTest.res.zip}",
-    ],
-    sdk_version: "current",
-}
-
-genrule {
-    name: "ScriptGroupTest-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "ScriptGroupTest.srcjar",
-        "ScriptGroupTest.res.zip",
-    ],
-    cmd: "for f in $(locations src/**/*.rscript); do " +
-        "  $(location llvm-rs-cc) -target-api 0" +
-        "  -o $(genDir)/res/raw -p $(genDir)/src " +
-        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-        "done && " +
-        "$(location soong_zip) -srcjar -o $(location ScriptGroupTest.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-        "$(location soong_zip) -o $(location ScriptGroupTest.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/ScriptGroupTest/Android.mk b/tests/java_api/ScriptGroupTest/Android.mk
new file mode 100644
index 0000000..6fed8b9
--- /dev/null
+++ b/tests/java_api/ScriptGroupTest/Android.mk

@@ -0,0 +1,35 @@
+#
+# Copyright (C) 2009 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_JAVA_LIBRARIES := android.test.runner.stubs
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) \
+                   $(call all-renderscript-files-under, src)
+#LOCAL_STATIC_JAVA_LIBRARIES := android.renderscript
+
+LOCAL_RENDERSCRIPT_FLAGS := -target-api 0
+
+LOCAL_PACKAGE_NAME := ScriptGroupTest
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+
+include $(BUILD_PACKAGE)

diff --git a/tests/java_api/VrDemo/Android.bp b/tests/java_api/VrDemo/Android.bp
deleted file mode 100644
index 5072f2c..0000000
--- a/tests/java_api/VrDemo/Android.bp
+++ /dev/null

@@ -1,56 +0,0 @@
-//
-// Copyright (C) 2015 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    // See: http://go/android-license-faq
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-android_test {
-    name: "VrDemo",
-    srcs: [
-        "src/**/*.java",
-        ":VrDemo-rscript{VrDemo.srcjar}",
-    ],
-    resource_zips: [
-        ":VrDemo-rscript{VrDemo.res.zip}",
-    ],
-    sdk_version: "current",
-}
-
-genrule {
-    name: "VrDemo-rscript",
-    srcs: [
-        "src/**/*.rscript",
-        ":rs_script_api",
-        ":rs_clang_headers",
-    ],
-    tools: [
-        "llvm-rs-cc",
-        "soong_zip",
-    ],
-    out: [
-        "VrDemo.srcjar",
-        "VrDemo.res.zip",
-    ],
-    cmd: "for f in $(locations src/**/*.rscript); do " +
-        "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " +
-        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
-        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
-        "done && " +
-        "$(location soong_zip) -srcjar -o $(location VrDemo.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
-        "$(location soong_zip) -o $(location VrDemo.res.zip) -C $(genDir)/res -D $(genDir)/res",
-}

diff --git a/tests/java_api/VrDemo/Android.mk b/tests/java_api/VrDemo/Android.mk
new file mode 100644
index 0000000..e33fb77
--- /dev/null
+++ b/tests/java_api/VrDemo/Android.mk

@@ -0,0 +1,29 @@
+#
+# Copyright (C) 2015 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
+LOCAL_SDK_VERSION := current
+
+LOCAL_PACKAGE_NAME := VrDemo
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+
+include $(BUILD_PACKAGE)

diff --git a/tests/lldb/.gitignore b/tests/lldb/.gitignore
new file mode 100644
index 0000000..b79973f
--- /dev/null
+++ b/tests/lldb/.gitignore

@@ -0,0 +1,65 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+*.swp
+
+*~
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+
+results.xml
+LLDBTestsuiteLog.txt

diff --git a/tests/lldb/Android.mk b/tests/lldb/Android.mk
new file mode 100644
index 0000000..95904a8
--- /dev/null
+++ b/tests/lldb/Android.mk

@@ -0,0 +1,4 @@
+LOCAL_PATH:=$(call my-dir)
+
+include $(call all-makefiles-under,$(LOCAL_PATH))
+

diff --git a/tests/lldb/README.txt b/tests/lldb/README.txt
new file mode 100644
index 0000000..70e93bc
--- /dev/null
+++ b/tests/lldb/README.txt

@@ -0,0 +1,307 @@
+- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----
+-   LLDB for Renderscript Test Suite
+-
+-   16/03/2016
+- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----
+
+Overview:
+
+    The LLDB for Renderscript test suite is written in Python and relies on
+    LLDB's Python bindings.  The test suite pushes several test apps onto
+    a target device and makes coordinated use of LLDB and ADB to run automated
+    debug sessions.
+
+Set up the test suite:
+
+    Check out AOSP and build it for your target. Navigate to
+    frameworks/rs/tests/lldb and type mm.  This should successfully build the
+    binaries that the test suite uses. They will be placed in the data/app,
+    system/lib and system/bin subdirectories of
+    <path to out folder>/target/product/<product code name>.
+
+Prerequisite:
+
+    An lldb-server executable must be present on your device/emulator.
+    LLDB must be compiled on your host machine along with its python interface.
+    lldb-server and lldb should be built from the same source revisions.
+
+Running the test suite:
+
+    The test suite can be run via the following command:
+
+        > python run_tests.py [-h]
+                              [--config path]
+                              [--device DEVICE]
+                              [--test path]
+                              [--install-only]
+                              [--no-install]
+                              [--no-uninstall]
+                              [--print-to-stdout]
+                              [--verbose]
+                              [--wimpy]
+                              [--run-emu]
+                              [--adb-path ADB_PATH]
+                              [--aosp-product-path AOSP_PRODUCT_PATH]
+                              [--blocklist BLOCKLIST [BLOCKLIST ...]]
+                              [--device-port DEVICE_PORT]
+                              [--emu-cmd EMU_CMD]
+                              [--host-port HOST_PORT]
+                              [--lldb-path LLDB_PATH]
+                              [--lldb-server-path-device LLDB_SERVER_PATH_DEVICE]
+                              [--lldb-server-path-host LLDB_SERVER_PATH_HOST]
+                              [--log-file-path LOG_FILE_PATH]
+                              [--results-file-path RESULTS_FILE_PATH]
+                              [--timeout TIMEOUT]
+
+        optional arguments:
+          -h, --help            show this help message and exit
+          --config path, -c path
+                                Path to a custom config file.
+          --device DEVICE, -d DEVICE
+                                Specify the device id of the device to test on.
+          --test path, -t path  Specify a specific test to run.
+          --install-only        It only runs the pre-run stage of the test suite. It
+                                installs the required APKs but does not execute the
+                                tests.
+          --no-install, -n      Stop the test suite installing apks to device.
+          --no-uninstall        Stop the test suite uninstalling apks after
+                                completion.
+          --print-to-stdout     Print all logging information to standard out.
+          --verbose, -v         Store extra info in the log.
+          --wimpy, -w           Test only a core subset of features.
+          --run-emu             Spawn an emulator and run the test suite on that.
+                                Specify the emulator command line in the config file
+                                or with --emu-cmd.
+          --adb-path ADB_PATH   Path to android debug bridge on the host.
+          --aosp-product-path AOSP_PRODUCT_PATH
+                                The path to the "out" folder of the AOSP repository.
+          --blocklist BLOCKLIST [BLOCKLIST ...]
+                                Provide a test blocklist for skipping specific tests.
+                                To specify the blocklist from the command line the
+                                following can be used: --blocklist test1.py test2.py
+                                ...
+          --device-port DEVICE_PORT
+                                Specify the port number that lldb-server (on the
+                                device) listens on. When lldb-server is spawned on the
+                                device it will listen on this port. Each successive
+                                test will increment onwards from this port.
+          --emu-cmd EMU_CMD     The command line for the emulator (if using --run-emu).
+          --host-port HOST_PORT
+                                Specify host port which lldb-server will be forwarded
+                                to. Specify the starting host port number that lldb-
+                                server (on the target) will be forwarded to on the
+                                host. Each successive test will increment onwards from
+                                this initial port.
+          --lldb-path LLDB_PATH
+                                The path to lldb executable on the host.
+          --lldb-server-path-device LLDB_SERVER_PATH_DEVICE
+                                Path to the lldb-server executable on the device.
+          --lldb-server-path-host LLDB_SERVER_PATH_HOST
+                                Path to the lldb-server executable on host (if using
+                                --run-emu).
+          --log-file-path LOG_FILE_PATH
+                                The path to the file where the log will be written.
+          --results-file-path RESULTS_FILE_PATH
+                                The path to the file where junit results.xml will be
+                                written.
+          --timeout TIMEOUT     Timeout period for a single command, expressed in
+                                seconds
+
+    An optional config file can be passed to the test suite which will provide
+    details of your specific environment. The user file should define a custom
+    class inheriting from Config in config.py. The class Config presents the
+    default set of options, available to be overridden.
+
+    All options in the config file can also be specified on the command line.
+
+    If your config and command line do not specify a path to the host lldb,
+    the PYTHONPATH environment variable must be set.  The appropriate value to
+    set this to can be obtained by running the following command:
+
+        > lldb -P
+
+    This will print out a path to the lldb python bindings on your local machine.
+
+Build Requirements:
+
+
+    The following revisions are from the llvm git mirror:
+
+    llvm : 5786b73
+    clang: b6d0b32
+    lldb : 98712eb
+
+    lldb has the following dependencies:
+
+      Python2.7.6
+      swig2.0
+      lldb-server
+
+Building LLDB python bindings:
+
+    Linux:
+
+        Build instructions for Linux lldb can be found on the official lldb web
+        page:
+
+          http://lldb.llvm.org/build.html
+
+        The following CMake variables should be enabled when generating:
+
+          LLDB_ENABLE_PYTHON_SCRIPTS_SWIG_API_GENERATION = True
+
+        As a post build step, swig will generate the python bindings for lldb.
+
+    Windows:
+
+        Prerequisites:
+
+            Visual Studio 2015
+            Custom x64 Python2.7.10 package:
+                http://p-nand-q.com/python/2015.08.07-Python2710-x64-vs2015.7z
+            Swig Version 3.0.5 (Added to the $PATH)
+            CMake 3.4.0 (So we can generate Visual Studio 2015 solutions)
+
+        Build Python for Windows:
+
+            http://p-nand-q.com/python/building-python-27-with-vs2010.html
+
+            The important thing here is that the above python distribution
+            contains debug versions of the libraries and is built with the same
+            Visual Studio version we are using so the runtimes do not conflict.
+
+        Build LLDB on Windows:
+
+            Select the CMake generator "Microsoft Visual Studio 2015 Win64".
+            The following CMake variables should be enabled when generating:
+
+                LLDB_DISABLE_PYTHON=False
+                LLDB_RELOCATABLE_PYTHON=False
+                PYTHON_HOME=<path to the above python release>
+
+            Using cmake-gui is a good idea and lets you make sure that swig has
+            been correctly detected.
+
+            In the CMake configure step, you should see something similar to the
+            following:
+            -- Found Python version 2.7.9
+            -- LLDB Found PythonExecutable: E:/Python27/python.exe and
+            E:/Python27/python_d.exe
+            -- LLDB Found PythonLibs: E:/Python27/libs/python27.lib and
+            E:/Python27/libs/python27_d.lib
+            -- LLDB Found PythonDLL: E:/Python27/python27.dll and
+            E:/Python27/python27_d.dll
+            -- LLDB Found PythonIncludeDirs: E:/Python27/Include
+            LLDB version: 3.8.0
+            Could NOT find Doxygen (missing:  DOXYGEN_EXECUTABLE)
+            Found PythonInterp:
+            $<$<CONFIG:Debug>:E:/Python27/python_d.exe>$<$<NOT:$<CONFIG:Debug>>:E:/Python27/python.exe>
+            (found version "1.4")
+
+            Build LLDB as a RelWithDebInfo build, because debug builds of lldb
+            produce an lldb_d.pyd file, but the __init__.py still refers to
+            lldb.pyd.
+
+            The LLDB python bindings should have built in
+            "llvm_build\RelWithDebInfo\lib\site-packages\lldb". Point the test
+            suite towards "llvm_build\RelWithDebInfo\bin\lldb.exe".
+            When running the test suite itself, make sure to use the python
+            executable from the custom package.
+
+A typical test transcript:
+
+    Located ADB
+    Located device ZX1G427S2S
+    Pushing all tests...
+    Pushed all tests
+    Pre run complete
+    Found 86 tests
+    Running test_allocation_dump_2_cpp.py: PASS
+    Running test_breakpoint_fileline_multiple_rs_files_cpp.py: PASS
+    Running test_read_local_cpp.py: PASS
+    Running test_breakpoint_kernel_multiple_rs_files_cpp.py: PASS
+    Running test_multiple_rs_files_cpp.py: PASS
+    Running test_breakpoint_kernel_all_multiple_rs_files_cpp.py: PASS
+    Running test_dwarf_lang_cpp.py: PASS
+    Running test_write_global_element_cpp.py: PASS
+    Running test_allocation_list_cpp.py: PASS
+    Running test_breakpoint_coordinate_cpp.py: PASS
+    Running test_rs_consts_cpp.py: PASS
+    Running test_allocation_file_cpp.py: PASS
+    Running test_allocation_dump_1_cpp.py: PASS
+    Running test_source_step_cpp.py: PASS
+    Running test_breakpoint_kernel_2_cpp.py: PASS
+    Running test_backtrace_cpp.py: PASS
+    Running test_language_subcmds_no_debug_cpp.py: PASS
+    Running test_breakpoint_kernel_1_cpp.py: PASS
+    Running test_language_subcmds_cpp.py: PASS
+    Running test_write_global_cpp.py: PASS
+    Running test_invoke_fun_cpp.py: PASS
+    Running test_breakpoint_fileline_cpp.py: PASS
+    Running test_write_local_cpp.py: PASS
+    Running test_breakpoint_kernel_all_cpp.py: PASS
+    Running test_write_local_element_cpp.py: PASS
+    Running test_call_api_funs_cpp.py: PASS
+    Running test_coordinates_cpp.py: PASS
+    Running test_read_global_cpp.py: PASS
+    Running test_language_subcmds.py: PASS
+    Running test_coordinates.py: PASS
+    Running test_language_subcmds_no_debug.py: PASS
+    Running test_read_local.py: PASS
+    Running test_call_api_funs.py: PASS
+    Running test_breakpoint_kernel_1.py: PASS
+    Running test_breakpoint_fileline.py: PASS
+    Running test_breakpoint_fileline_multiple_rs_files.py: PASS
+    Running test_rs_consts.py: PASS
+    Running test_invoke_fun.py: PASS
+    Running test_write_local_element.py: PASS
+    Running test_source_step.py: PASS
+    Running test_allocation_file.py: PASS
+    Running test_allocation_list.py: PASS
+    Running test_breakpoint_kernel_multiple_rs_files.py: PASS
+    Running test_allocation_dump_1.py: PASS
+    Running test_breakpoint_kernel_all.py: PASS
+    Running test_allocation_dump_2.py: PASS
+    Running test_allocation_dump_struct.py: PASS
+    Running test_read_global.py: PASS
+    Running test_language.py: PASS
+    Running test_dwarf_lang.py: PASS
+    Running test_breakpoint_coordinate.py: PASS
+    Running test_write_global.py: PASS
+    Running test_multiple_rs_files.py: PASS
+    Running test_write_global_element.py: PASS
+    Running test_breakpoint_kernel_all_multiple_rs_files.py: PASS
+    Running test_breakpoint_kernel_2.py: PASS
+    Running test_write_local.py: PASS
+    Running test_backtrace.py: PASS
+    Running test_call_api_funs_jni.py: PASS
+    Running test_invoke_fun_jni.py: PASS
+    Running test_allocation_dump_1_jni.py: PASS
+    Running test_breakpoint_fileline_multiple_rs_files_jni.py: PASS
+    Running test_allocation_file_jni.py: PASS
+    Running test_breakpoint_fileline_jni.py: PASS
+    Running test_source_step_jni.py: PASS
+    Running test_coordinates_jni.py: PASS
+    Running test_rs_consts_jni.py: PASS
+    Running test_breakpoint_kernel_all_multiple_rs_files_jni.py: PASS
+    Running test_multiple_rs_files_jni.py: PASS
+    Running test_allocation_dump_2_jni.py: PASS
+    Running test_allocation_list_jni.py: PASS
+    Running test_write_local_element_jni.py: PASS
+    Running test_breakpoint_kernel_all_jni.py: PASS
+    Running test_breakpoint_coordinate_jni.py: PASS
+    Running test_language_subcmds_no_debug_jni.py: PASS
+    Running test_read_local_jni.py: PASS
+    Running test_dwarf_lang_jni.py: PASS
+    Running test_breakpoint_kernel_2_jni.py: PASS
+    Running test_breakpoint_kernel_multiple_rs_files_jni.py: PASS
+    Running test_write_global_element_jni.py: PASS
+    Running test_breakpoint_kernel_1_jni.py: PASS
+    Running test_read_global_jni.py: PASS
+    Running test_language_subcmds_jni.py: PASS
+    Running test_write_global_jni.py: PASS
+    Running test_backtrace_jni.py: PASS
+    Running test_write_local_jni.py: PASS
+    Uninstalled/Deleted all tests
+    86 of 86 passed
+    100% rate

diff --git a/tests/lldb/config.py b/tests/lldb/config.py
new file mode 100644
index 0000000..bc478d0
--- /dev/null
+++ b/tests/lldb/config.py

@@ -0,0 +1,116 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''LLDB-Renderscript test suite configuration file.
+
+This file contains the default test suite config which will be used in the
+case a developer did not supply a custom one.'''
+
+import os
+from tests.harness.decorators import deprecated
+
+
+class Config(object):
+    '''Test suite configuration object.
+
+    The Config class is used by the test suite to abstract the specifics of a
+    user's local setup.  This config can be overridden by specifying a custom
+    config on the command line.'''
+    # pylint: disable=no-self-use
+
+    @property  # every option below is a property without a setter that returns its default value
+    def adb_path(self):
+        '''Path to android debug bridge on the host.'''
+        return 'adb'  # bare command name, not an absolute path
+
+    @property
+    def host_port(self):
+        '''Specify host port which lldb-server will be forwarded to.
+
+        Specify the starting host port number that lldb-server (on the target)
+        will be forwarded to on the host. Each successive test will increment
+        onwards from this initial port.'''
+        return 1234  # same default as device_port
+
+    @property
+    def device_port(self):
+        '''Specify the port number that lldb-server (on the device) listens on.
+
+        When lldb-server is spawned on the device it will listen on this port.
+        Each successive test will increment onwards from this port.'''
+        return 1234  # same default as host_port
+
+    @property
+    def lldb_server_path_device(self):
+        '''Path to the lldb-server executable on the device.'''
+        return '/data/lldb-server'
+
+    @property
+    def lldb_server_path_host(self):
+        '''Path to the lldb-server executable on host (if using -run-emu).'''
+        return 'lldb-server'  # bare command name, not an absolute path
+
+    @property
+    def aosp_product_path(self):
+        '''The path to the "out" folder of the AOSP repository.'''
+        return os.getenv('ANDROID_PRODUCT_OUT')  # None when ANDROID_PRODUCT_OUT is not set
+
+    @property
+    def log_file_path(self):
+        '''The path to the file where the log will be written.'''
+        return os.path.join(os.getcwd(), 'LLDBTestsuiteLog.txt')  # relative to the working directory at access time
+
+    @property
+    def results_file_path(self):
+        '''The path to the file where junit results.xml will be written.'''
+        return os.path.join(os.getcwd(), 'results.xml')  # relative to the working directory at access time
+
+    @property
+    def lldb_path(self):
+        '''The path to lldb executable on the host.'''
+        return 'lldb'  # bare command name, not an absolute path
+
+    @property
+    def blocklist(self):
+        '''Provide a test blocklist for skipping specific tests.
+
+        To specify the blocklist from the command line the following can be
+        used: --blocklist test1.py test2.py ...'''
+        return []  # a new empty list on every access: nothing is skipped by default
+
+    @property
+    def verbose(self):
+        '''Flag to indicate whether to store extra output in the logs.'''
+        return False
+
+    @property
+    def device(self):
+        '''Specify the device id of the device to run on.
+
+        When multiple devices or emulators are present, a specific device to
+        use while testing can be indicated here.'''
+        return os.environ.get('ANDROID_SERIAL')  # None when ANDROID_SERIAL is not set
+
+    @property
+    def timeout(self):
+        '''Timeout period for a single command, expressed in seconds'''
+        return 60 * 15  # 15 minutes
+
+    @property
+    @deprecated()  # imported from tests.harness.decorators; its exact effect is not visible in this file
+    def emu_cmd(self):
+        '''The command line for the emulator (if using -run-emu).'''
+        return os.path.join(os.path.dirname(__file__), '..', '..', '..', '..',  # four directories above this file
+                            'prebuilts', 'android-emulator', 'linux-x86_64',  # only the linux-x86_64 prebuilt is named
+                            'emulator')

diff --git a/tests/lldb/cpp/Allocations/Allocations.cpp b/tests/lldb/cpp/Allocations/Allocations.cpp
new file mode 100644
index 0000000..4ad546d
--- /dev/null
+++ b/tests/lldb/cpp/Allocations/Allocations.cpp

@@ -0,0 +1,416 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <RenderScript.h>
+
+#include "ScriptC_allocs.h"
+
+sp<RS> mRS; // context handle passed to every Element/Type/Allocation call in this file
+sp<Allocation> mBoolAllocation;  // boolean
+
+sp<Allocation> mCharAllocation;  // char
+sp<Allocation> mChar2Allocation;   // char2
+sp<Allocation> mChar3Allocation;   // char3
+sp<Allocation> mChar4Allocation;   // char4
+
+sp<Allocation> mUCharAllocation;   // uchar
+sp<Allocation> mUChar2Allocation;  // uchar2
+sp<Allocation> mUChar3Allocation;  // uchar3
+sp<Allocation> mUChar4Allocation;  // uchar4
+
+sp<Allocation> mShortAllocation;   // short
+sp<Allocation> mShort2Allocation;  // short2
+sp<Allocation> mShort3Allocation;  // short3
+sp<Allocation> mShort4Allocation;  // short4
+
+sp<Allocation> mUShortAllocation;  // ushort
+sp<Allocation> mUShort2Allocation; // ushort2
+sp<Allocation> mUShort3Allocation; // ushort3
+sp<Allocation> mUShort4Allocation; // ushort4
+
+sp<Allocation> mIntAllocation;   // int
+sp<Allocation> mInt2Allocation;  // int2
+sp<Allocation> mInt3Allocation;  // int3
+sp<Allocation> mInt4Allocation;  // int4
+
+sp<Allocation> mUIntAllocation;  // uint
+sp<Allocation> mUInt2Allocation;   // uint2
+sp<Allocation> mUInt3Allocation;   // uint3
+sp<Allocation> mUInt4Allocation;   // uint4
+
+sp<Allocation> mLongAllocation;  // long
+sp<Allocation> mLong2Allocation;   // long2
+sp<Allocation> mLong3Allocation;   // long3
+sp<Allocation> mLong4Allocation;   // long4
+
+sp<Allocation> mULongAllocation;   // ulong
+sp<Allocation> mULong2Allocation;  // ulong2
+sp<Allocation> mULong3Allocation;  // ulong3
+sp<Allocation> mULong4Allocation;  // ulong4
+
+sp<Allocation> mHalfAllocation;  // half
+sp<Allocation> mHalf2Allocation;   // half2
+sp<Allocation> mHalf3Allocation;   // half3
+sp<Allocation> mHalf4Allocation;   // half4
+
+sp<Allocation> mFloatAllocation;   // float
+sp<Allocation> mFloat2Allocation;  // float2
+sp<Allocation> mFloat3Allocation;  // float3
+sp<Allocation> mFloat4Allocation;  // float4
+
+sp<Allocation> mDoubleAllocation;  // double
+sp<Allocation> mDouble2Allocation; // double2
+sp<Allocation> mDouble3Allocation; // double3
+sp<Allocation> mDouble4Allocation; // double4
+
+const int mAllocSize = 24; // Cell count of the scalar 1D allocations and length of the host fill buffers; vector allocations use mAllocSize / 2 or / 4. Needs to be < CHAR_MAX and divisible by 4.
+const int mBitmapSize = 64; // Not referenced by the signed/unsigned create/init functions; presumably used further down the file - TODO confirm
+
+void createSignedAllocations() { // Creates the bool and signed char/short/int/long allocations; one allocation per integer family is built from a Type::Builder, the rest via createSized.
+    Type::Builder typeI8Builder(mRS, Element::I8(mRS));
+    typeI8Builder.setX(1); // One element here to test 16 byte memory alignment
+    typeI8Builder.setY(3);
+    typeI8Builder.setZ(8); // 1 x 3 x 8 = 24 cells, i.e. mAllocSize
+
+    mCharAllocation = Allocation::createTyped(mRS, typeI8Builder.create());
+    mChar2Allocation = Allocation::createSized(mRS, Element::I8_2(mRS), mAllocSize / 2);
+    mChar3Allocation = Allocation::createSized(mRS, Element::I8_3(mRS), mAllocSize / 4); // 3-component vectors get the same cell count as 4-component ones (presumably stored padded to 4 - TODO confirm)
+    mChar4Allocation = Allocation::createSized(mRS, Element::I8_4(mRS), mAllocSize / 4);
+
+    Type::Builder typeI16_2Builder(mRS, Element::I16_2(mRS));
+    typeI16_2Builder.setX(6);
+    typeI16_2Builder.setY(1);
+    typeI16_2Builder.setZ(2); // 6 x 1 x 2 = 12 cells, i.e. mAllocSize / 2
+
+    mShortAllocation = Allocation::createSized(mRS, Element::I16(mRS), mAllocSize);
+    mShort2Allocation = Allocation::createTyped(mRS, typeI16_2Builder.create());
+    mShort3Allocation = Allocation::createSized(mRS, Element::I16_3(mRS), mAllocSize / 4);
+    mShort4Allocation = Allocation::createSized(mRS, Element::I16_4(mRS), mAllocSize / 4);
+
+    Type::Builder typeI32_3Builder(mRS, Element::I32_3(mRS));
+    typeI32_3Builder.setX(3);
+    typeI32_3Builder.setY(2); // 3 x 2 = 6 cells, i.e. mAllocSize / 4
+
+    mIntAllocation = Allocation::createSized(mRS, Element::I32(mRS), mAllocSize);
+    mInt2Allocation = Allocation::createSized(mRS, Element::I32_2(mRS), mAllocSize / 2);
+    mInt3Allocation = Allocation::createTyped(mRS, typeI32_3Builder.create());
+    mInt4Allocation = Allocation::createSized(mRS, Element::I32_4(mRS), mAllocSize / 4);
+
+    Type::Builder typeI64_4Builder(mRS, Element::I64_4(mRS));
+    typeI64_4Builder.setX(1);
+    typeI64_4Builder.setY(6); // 1 x 6 = 6 cells, i.e. mAllocSize / 4
+
+    mLongAllocation = Allocation::createSized(mRS, Element::I64(mRS), mAllocSize);
+    mLong2Allocation = Allocation::createSized(mRS, Element::I64_2(mRS), mAllocSize / 2);
+    mLong3Allocation = Allocation::createSized(mRS, Element::I64_3(mRS), mAllocSize / 4);
+    mLong4Allocation = Allocation::createTyped(mRS, typeI64_4Builder.create());
+
+    mBoolAllocation = Allocation::createSized(mRS, Element::BOOLEAN(mRS), mAllocSize); // created last in this function
+}
+
+void initSignedAllocations() { // Fills the bool and signed allocations from temporary host buffers; expects createSignedAllocations() to have assigned the handles first.
+    char *buffer_char = new char[mAllocSize];
+    short *buffer_short = new short[mAllocSize];
+    int *buffer_int = new int[mAllocSize];
+    int64_t *buffer_long = new int64_t[mAllocSize];
+    char *buffer_bool = new char[mAllocSize]; // one char per boolean cell
+
+    for(int i = 0; i < mAllocSize; ++i) { // every numeric buffer holds the ramp 0, 1, ..., mAllocSize - 1
+        buffer_char[i] = (char) i;
+        buffer_short[i] = (short) i;
+        buffer_int[i] = (int) i;
+        buffer_long[i] = (int64_t) i;
+        buffer_bool[i] =  (char) (0x01 & i); // alternates 0, 1, 0, 1, ...
+    }
+
+    mCharAllocation->copy3DRangeFrom(0, 0, 0, 1, 3, 8, buffer_char); // extents repeat the 1 x 3 x 8 type from createSignedAllocations
+    mChar2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_char); // each copy starts at the beginning of the same buffer
+    mChar3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_char);
+    mChar4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_char);
+
+    delete [] buffer_char; // each buffer is released as soon as its last copy has been issued
+
+    mShortAllocation->copy1DRangeFrom(0, mAllocSize, buffer_short);
+    mShort2Allocation->copy3DRangeFrom(0, 0, 0, 6, 1, 2, buffer_short); // extents repeat the 6 x 1 x 2 type
+    mShort3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_short);
+    mShort4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_short);
+
+    delete [] buffer_short;
+
+    mIntAllocation->copy1DRangeFrom(0, mAllocSize, buffer_int);
+    mInt2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_int);
+    mInt3Allocation->copy2DRangeFrom(0, 0, 3, 2, buffer_int); // extents repeat the 3 x 2 type
+    mInt4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_int);
+
+    delete [] buffer_int;
+
+    mLongAllocation->copy1DRangeFrom(0, mAllocSize, buffer_long);
+    mLong2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_long);
+    mLong3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_long);
+    mLong4Allocation->copy2DRangeFrom(0, 0, 1, 6, buffer_long); // extents repeat the 1 x 6 type
+
+    delete [] buffer_long;
+
+    mBoolAllocation->copy1DRangeFrom(0, mAllocSize, buffer_bool);
+
+    delete [] buffer_bool;
+}
+
+void createUnsignedAllocations() { // Creates the uchar/ushort/uint/ulong allocations; one allocation per family is built from a Type::Builder, the rest via createSized.
+    Type::Builder typeU8_2Builder(mRS, Element::U8_2(mRS));
+    typeU8_2Builder.setX(2);
+    typeU8_2Builder.setY(6); // 2 x 6 = 12 cells, i.e. mAllocSize / 2
+
+    mUCharAllocation = Allocation::createSized(mRS, Element::U8(mRS), mAllocSize);
+    mUChar2Allocation = Allocation::createTyped(mRS, typeU8_2Builder.create());
+    mUChar3Allocation = Allocation::createSized(mRS, Element::U8_3(mRS), mAllocSize / 4); // 3-component vectors get the same cell count as 4-component ones (presumably stored padded to 4 - TODO confirm)
+    mUChar4Allocation = Allocation::createSized(mRS, Element::U8_4(mRS), mAllocSize / 4);
+
+    Type::Builder typeU16_3Builder(mRS, Element::U16_3(mRS));
+    typeU16_3Builder.setX(1);
+    typeU16_3Builder.setY(6); // 1 x 6 = 6 cells, i.e. mAllocSize / 4
+
+    mUShortAllocation = Allocation::createSized(mRS, Element::U16(mRS), mAllocSize);
+    mUShort2Allocation = Allocation::createSized(mRS, Element::U16_2(mRS), mAllocSize / 2);
+    mUShort3Allocation = Allocation::createTyped(mRS, typeU16_3Builder.create());
+    mUShort4Allocation = Allocation::createSized(mRS, Element::U16_4(mRS), mAllocSize / 4);
+
+    Type::Builder typeU32_4Builder(mRS, Element::U32_4(mRS));
+    typeU32_4Builder.setX(1);
+    typeU32_4Builder.setY(1);
+    typeU32_4Builder.setZ(6); // 1 x 1 x 6 = 6 cells, i.e. mAllocSize / 4
+
+    mUIntAllocation = Allocation::createSized(mRS, Element::U32(mRS), mAllocSize);
+    mUInt2Allocation = Allocation::createSized(mRS, Element::U32_2(mRS), mAllocSize / 2);
+    mUInt3Allocation = Allocation::createSized(mRS, Element::U32_3(mRS), mAllocSize / 4);
+    mUInt4Allocation = Allocation::createTyped(mRS, typeU32_4Builder.create());
+
+    Type::Builder typeU64Builder(mRS, Element::U64(mRS));
+    typeU64Builder.setX(4);
+    typeU64Builder.setY(3);
+    typeU64Builder.setZ(2); // 4 x 3 x 2 = 24 cells, i.e. mAllocSize
+
+    mULongAllocation = Allocation::createTyped(mRS, typeU64Builder.create());
+    mULong2Allocation = Allocation::createSized(mRS, Element::U64_2(mRS), mAllocSize / 2);
+    mULong3Allocation = Allocation::createSized(mRS, Element::U64_3(mRS), mAllocSize / 4);
+    mULong4Allocation = Allocation::createSized(mRS, Element::U64_4(mRS), mAllocSize / 4);
+}
+
+void initUnsignedAllocations() { // Fills the unsigned allocations from temporary host buffers; expects createUnsignedAllocations() to have assigned the handles first.
+    char *buffer_char = new char[mAllocSize]; // plain char/short/int buffers are the source for the uchar/ushort/uint allocations
+    short *buffer_short = new short[mAllocSize];
+    int *buffer_int = new int[mAllocSize];
+    uint64_t *buffer_long = new uint64_t[mAllocSize];
+
+    for(int i = 0; i < mAllocSize; ++i) { // every buffer holds the ramp 0, 1, ..., mAllocSize - 1 (all values non-negative)
+        buffer_char[i] = (char) i;
+        buffer_short[i] = (short) i;
+        buffer_int[i] = (int) i;
+        buffer_long[i] = (uint64_t) i;
+    }
+
+    mUCharAllocation->copy1DRangeFrom(0, mAllocSize, buffer_char);
+    mUChar2Allocation->copy2DRangeFrom(0, 0, 2, 6, buffer_char); // extents repeat the 2 x 6 type from createUnsignedAllocations
+    mUChar3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_char); // each copy starts at the beginning of the same buffer
+    mUChar4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_char);
+
+    delete [] buffer_char; // each buffer is released as soon as its last copy has been issued
+
+    mUShortAllocation->copy1DRangeFrom(0, mAllocSize, buffer_short);
+    mUShort2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_short);
+    mUShort3Allocation->copy2DRangeFrom(0, 0, 1, 6, buffer_short); // extents repeat the 1 x 6 type
+    mUShort4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_short);
+
+    delete [] buffer_short;
+
+    mUIntAllocation->copy1DRangeFrom(0, mAllocSize, buffer_int);
+    mUInt2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_int);
+    mUInt3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_int);
+    mUInt4Allocation->copy3DRangeFrom(0, 0, 0, 1, 1, 6, buffer_int); // extents repeat the 1 x 1 x 6 type
+
+    delete [] buffer_int;
+
+    mULongAllocation->copy3DRangeFrom(0, 0, 0, 4, 3, 2, buffer_long); // extents repeat the 4 x 3 x 2 type
+    mULong2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_long);
+    mULong3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_long);
+    mULong4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_long);
+
+    delete [] buffer_long;
+}
+
+void createFloatAllocations() { // Creates the half/float/double allocations; half3, float4 and double2/3/4 are built from a Type::Builder, the rest via createSized.
+    Type::Builder typeF16_3Builder(mRS, Element::F16_3(mRS));
+    typeF16_3Builder.setX(1);
+    typeF16_3Builder.setY(6); // 1 x 6 = 6 cells, i.e. mAllocSize / 4
+
+    mHalfAllocation = Allocation::createSized(mRS, Element::F16(mRS), mAllocSize);
+    mHalf2Allocation = Allocation::createSized(mRS, Element::F16_2(mRS), mAllocSize / 2);
+    mHalf3Allocation = Allocation::createTyped(mRS, typeF16_3Builder.create());
+    mHalf4Allocation = Allocation::createSized(mRS, Element::F16_4(mRS), mAllocSize / 4);
+
+    Type::Builder typeF32_4Builder(mRS, Element::F32_4(mRS));
+    typeF32_4Builder.setX(3);
+    typeF32_4Builder.setY(2); // 3 x 2 = 6 cells, i.e. mAllocSize / 4
+
+    mFloatAllocation = Allocation::createSized(mRS, Element::F32(mRS), mAllocSize);
+    mFloat2Allocation = Allocation::createSized(mRS, Element::F32_2(mRS), mAllocSize / 2);
+    mFloat3Allocation = Allocation::createSized(mRS, Element::F32_3(mRS), mAllocSize / 4); // 3-component vectors get the same cell count as 4-component ones (presumably stored padded to 4 - TODO confirm)
+    mFloat4Allocation = Allocation::createTyped(mRS, typeF32_4Builder.create());
+
+    Type::Builder typeF64_2Builder(mRS, Element::F64_2(mRS));
+    typeF64_2Builder.setX(4);
+    typeF64_2Builder.setY(1);
+    typeF64_2Builder.setZ(3); // 4 x 1 x 3 = 12 cells, i.e. mAllocSize / 2
+
+    mDoubleAllocation = Allocation::createSized(mRS, Element::F64(mRS), mAllocSize);
+    mDouble2Allocation = Allocation::createTyped(mRS, typeF64_2Builder.create());
+
+    Type::Builder typeF64_3Builder(mRS, Element::F64_3(mRS));
+    typeF64_3Builder.setX(1);
+    typeF64_3Builder.setY(2);
+    typeF64_3Builder.setZ(3); // 1 x 2 x 3 = 6 cells, i.e. mAllocSize / 4
+
+    Type::Builder typeF64_4Builder(mRS, Element::F64_4(mRS));
+    typeF64_4Builder.setX(1);
+    typeF64_4Builder.setY(2);
+    typeF64_4Builder.setZ(3); // same 1 x 2 x 3 shape as the double3 type
+
+    mDouble3Allocation = Allocation::createTyped(mRS, typeF64_3Builder.create());
+    mDouble4Allocation = Allocation::createTyped(mRS, typeF64_4Builder.create());
+}
+
+// Populates every half/float/double allocation from host buffers holding the
+// sequence 1/i for i in [0, mAllocSize). Each buffer is released as soon as
+// the last allocation of its precision has been filled.
+// NOTE(review): index 0 divides by zero; presumably this yields +inf on the
+// supported targets and the test relies on it - confirm before changing.
+void initFloatAllocations() {
+    __fp16 *halves = new __fp16[mAllocSize];
+    float *singles = new float[mAllocSize];
+    double *doubles = new double[mAllocSize];
+
+    int idx = 0;
+    while (idx < mAllocSize) {
+        halves[idx] = static_cast<__fp16>(1) / idx;
+        singles[idx] = static_cast<float>(1) / idx;
+        doubles[idx] = static_cast<double>(1) / idx;
+        ++idx;
+    }
+
+    // Half precision: 1D copies except for the 1 x 6 half3 grid.
+    mHalfAllocation->copy1DRangeFrom(0, mAllocSize, halves);
+    mHalf2Allocation->copy1DRangeFrom(0, mAllocSize/2, halves);
+    mHalf3Allocation->copy2DRangeFrom(0, 0, 1, 6, halves);
+    mHalf4Allocation->copy1DRangeFrom(0, mAllocSize/4, halves);
+
+    delete [] halves;
+
+    // Single precision: 1D copies except for the 3 x 2 float4 grid.
+    mFloatAllocation->copy1DRangeFrom(0, mAllocSize, singles);
+    mFloat2Allocation->copy1DRangeFrom(0, mAllocSize/2, singles);
+    mFloat3Allocation->copy1DRangeFrom(0, mAllocSize/4, singles);
+    mFloat4Allocation->copy2DRangeFrom(0, 0, 3, 2, singles);
+
+    delete [] singles;
+
+    // Double precision: the vector allocations are all 3D.
+    mDoubleAllocation->copy1DRangeFrom(0, mAllocSize, doubles);
+    mDouble2Allocation->copy3DRangeFrom(0, 0, 0, 4, 1, 3, doubles);
+    mDouble3Allocation->copy3DRangeFrom(0, 0, 0, 1, 2, 3, doubles);
+    mDouble4Allocation->copy3DRangeFrom(0, 0, 0, 1, 2, 3, doubles);
+
+    delete [] doubles;
+}
+
+// Test driver: runs three kernel launches (swizzle, square, add_half), each
+// preceded by creating and filling one family of allocations (signed,
+// unsigned, floating point). Most allocations of a family are released
+// before the next family is created.
+int main()
+{
+    mRS = new RS();
+
+    // NOTE(review): RS_INIT_WAIT_FOR_ATTACH presumably makes the runtime wait
+    // for a debugger to attach - confirm against the RenderScript headers.
+    mRS->init("/data/rscache", RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);
+
+    sp<ScriptC_allocs> mScript = new ScriptC_allocs(mRS);
+
+    // Square RGBA_8888 type shared by the swizzle kernel's input and output.
+    Type::Builder typeRGBA_888Builder(mRS, Element::RGBA_8888(mRS));
+    typeRGBA_888Builder.setX(mBitmapSize);
+    typeRGBA_888Builder.setY(mBitmapSize);
+
+    sp<Allocation> mInAllocation = Allocation::createTyped(mRS, typeRGBA_888Builder.create());
+
+    // image_area counts elements; image_size is the byte size at sizeof(int)
+    // bytes per element.
+    const int image_area = mBitmapSize*mBitmapSize;
+    const int image_size = image_area*sizeof(int);
+
+    // Zero-fill the input allocation from a temporary host buffer.
+    char *zero_buffer = new char[image_size];
+    memset(zero_buffer, 0, image_size);
+    mInAllocation->copy1DRangeFrom(0, image_area, zero_buffer);
+    delete [] zero_buffer;
+
+    sp<Allocation> mOutAllocation = Allocation::createTyped(mRS, typeRGBA_888Builder.create());
+    createSignedAllocations();
+    initSignedAllocations();
+
+    // First launch: swizzle_kernel, bracketed by finish() calls.
+    mRS->finish();
+    mScript->forEach_swizzle_kernel(mInAllocation, mOutAllocation);
+    mRS->finish();
+
+    // Drop the signed allocations.
+    // NOTE(review): mShortAllocation is not cleared here, unlike its
+    // siblings - confirm whether that is intentional.
+    mCharAllocation.clear();
+    mChar2Allocation.clear();
+    mChar3Allocation.clear();
+    mChar4Allocation.clear();
+
+    mShort2Allocation.clear();
+    mShort3Allocation.clear();
+    mShort4Allocation.clear();
+
+    mIntAllocation.clear();
+    mInt2Allocation.clear();
+    mInt3Allocation.clear();
+    mInt4Allocation.clear();
+
+    mLongAllocation.clear();
+    mLong2Allocation.clear();
+    mLong3Allocation.clear();
+    mLong4Allocation.clear();
+
+    mBoolAllocation.clear();
+
+    createUnsignedAllocations();
+    initUnsignedAllocations();
+
+    // Rebind the local input handle to the ushort allocation for the next launch.
+    mInAllocation = mUShortAllocation; // Host side assignment
+
+    // Second launch: square_kernel reads ushort input and writes the uint allocation.
+    mRS->finish();
+    mScript->forEach_square_kernel(mInAllocation, mUIntAllocation);
+    mRS->finish();
+
+    // Drop the unsigned allocations. mUIntAllocation (the kernel output) is
+    // not cleared, and mInAllocation still references the ushort allocation.
+    mUCharAllocation.clear();
+    mUChar2Allocation.clear();
+    mUChar3Allocation.clear();
+    mUChar4Allocation.clear();
+
+    mUShortAllocation.clear();
+    mUShort2Allocation.clear();
+    mUShort3Allocation.clear();
+    mUShort4Allocation.clear();
+
+    mUInt2Allocation.clear();
+    mUInt3Allocation.clear();
+    mUInt4Allocation.clear();
+
+    mULongAllocation.clear();
+    mULong2Allocation.clear();
+    mULong3Allocation.clear();
+    mULong4Allocation.clear();
+
+    createFloatAllocations();
+    initFloatAllocations();
+
+    // Third launch: add_half_kernel reads double4 input and writes double3 output.
+    mRS->finish();
+    mScript->forEach_add_half_kernel(mDouble4Allocation, mDouble3Allocation);
+    mRS->finish();
+
+    return 0;
+}
+

diff --git a/tests/lldb/cpp/Allocations/Android.mk b/tests/lldb/cpp/Allocations/Android.mk
new file mode 100644
index 0000000..9f72923
--- /dev/null
+++ b/tests/lldb/cpp/Allocations/Android.mk

@@ -0,0 +1,15 @@
+# Builds the CppAllocations executable from the host driver and its script.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := CppAllocations
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+
+LOCAL_SRC_FILES := \
+	Allocations.cpp \
+	allocs.rscript
+
+# NOTE(review): presumably required for the __fp16 conversions used by
+# Allocations.cpp - confirm.
+LOCAL_STATIC_LIBRARIES := libcompiler_rt
+
+# Settings shared by all C++ lldb tests.
+include frameworks/rs/tests/lldb/cpp/common.mk
+include $(BUILD_EXECUTABLE)

diff --git a/tests/lldb/cpp/Allocations/allocs.rscript b/tests/lldb/cpp/Allocations/allocs.rscript
new file mode 100644
index 0000000..eff7977
--- /dev/null
+++ b/tests/lldb/cpp/Allocations/allocs.rscript

@@ -0,0 +1,47 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.cppallocations)
+
+// Kernel reverses the component order of each uchar4 element (xyzw -> wzyx)
+uchar4 __attribute__((kernel)) swizzle_kernel(uchar4 in)
+{
+    return in.wzyx;
+}
+
+// Kernel widens each ushort element to uint and returns its square
+uint __attribute__((kernel)) square_kernel(ushort in)
+{
+    uint result = (uint)in * (uint)in;
+    return result;
+}
+
+// Helper function returning the passed in double plus 0.5
+static double half_helper(double in)
+{
+    return (in + 0.5);
+}
+
+// Kernel returns x, y and z of a double4, each plus 0.5; w is dropped
+double3 __attribute__((kernel)) add_half_kernel(double4 in)
+{
+    double3 result;
+    result.x = half_helper(in.x);
+    result.y = half_helper(in.y);
+    result.z = half_helper(in.z);
+    return result;
+}

diff --git a/tests/lldb/cpp/Android.mk b/tests/lldb/cpp/Android.mk
new file mode 100644
index 0000000..8338432
--- /dev/null
+++ b/tests/lldb/cpp/Android.mk

@@ -0,0 +1,2 @@
+# Pull in the Android.mk of every test subdirectory.
+include $(call all-subdir-makefiles)
+

diff --git a/tests/lldb/cpp/BranchingFunCalls/Android.mk b/tests/lldb/cpp/BranchingFunCalls/Android.mk
new file mode 100644
index 0000000..c06e6af
--- /dev/null
+++ b/tests/lldb/cpp/BranchingFunCalls/Android.mk

@@ -0,0 +1,13 @@
+# Builds the CppBranchingFunCalls executable from the host driver and its script.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := CppBranchingFunCalls
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+
+LOCAL_SRC_FILES := \
+	BranchingFunCalls.cpp \
+	scalars.rscript
+
+# Settings shared by all C++ lldb tests.
+include frameworks/rs/tests/lldb/cpp/common.mk
+include $(BUILD_EXECUTABLE)

diff --git a/tests/lldb/cpp/BranchingFunCalls/BranchingFunCalls.cpp b/tests/lldb/cpp/BranchingFunCalls/BranchingFunCalls.cpp
new file mode 100644
index 0000000..f0eee09
--- /dev/null
+++ b/tests/lldb/cpp/BranchingFunCalls/BranchingFunCalls.cpp

@@ -0,0 +1,55 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <RenderScript.h>
+
+#include "ScriptC_scalars.h"
+
+// Test driver: fills a 64x64 int allocation with values centred on zero,
+// bumps the script global once, runs simple_kernel over the allocation and
+// reads the result back into a scratch buffer.
+int main()
+{
+    static const int kSide = 64;
+    static const int kCells = kSide*kSide;
+    sp<RS> context = new RS();
+
+    context->init("/data/rscache", RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);
+
+    auto intElement = Element::I32(context);
+    Type::Builder gridBuilder(context, intElement);
+    gridBuilder.setX(kSide);
+    gridBuilder.setY(kSide);
+    auto gridType = gridBuilder.create();
+
+    auto source = Allocation::createTyped(context, gridType);
+    auto sink = Allocation::createTyped(context, gridType);
+
+    // Input spans [-kCells/2, kCells/2) so the kernel sees negative, zero
+    // and positive values.
+    int * seed = new int[kCells];
+    for(int cell = 0; cell < kCells; ++cell) {
+        seed[cell] = cell - (kCells / 2);
+    }
+    source->copy2DRangeFrom(0, 0, kSide, kSide, seed);
+    delete [] seed;
+
+    // Script is executed once, then the data is copied back when finished
+    sp<ScriptC_scalars> script = new ScriptC_scalars(context);
+    script->invoke_addToGlobal(234);
+    script->forEach_simple_kernel(source, sink);
+    context->finish();
+    int32_t * readback = new int32_t[kCells];
+    sink->copy2DRangeTo(0, 0, kSide, kSide, readback);
+    delete [] readback;
+
+    return 0;
+}
+

diff --git a/tests/lldb/cpp/BranchingFunCalls/scalars.rscript b/tests/lldb/cpp/BranchingFunCalls/scalars.rscript
new file mode 100644
index 0000000..279694d
--- /dev/null
+++ b/tests/lldb/cpp/BranchingFunCalls/scalars.rscript

@@ -0,0 +1,76 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.cppbranchingfuncalls)
+
+static bool is_neg(int a)  // Returns true when a is strictly negative.
+{
+    if(a < 0)
+        return true;
+    else
+        return false;
+}
+
+static bool is_pos(int a)  // Returns true when a is strictly positive.
+{
+    if(a > 0)
+        return true;
+    else
+        return false;
+}
+
+static void set_i(int * a, int b)  // Stores b into *a through a temporary.
+{
+    int tmp = b;
+    *a = tmp;
+}
+
+static void modify_f(float * f)  // Halves *f in place.
+{
+    *f *= 0.5f;
+}
+
+static void modify_i(int * i)  // Rewrites *i: negative -> 0, zero -> cutoff, positive -> min(*i, cutoff).
+{
+    int j = *i;
+    int cutoff = 2 << 6;  // 128
+    if(j > cutoff)
+        j = cutoff;
+    if(is_neg(j))
+        set_i(i, 0);  // negative input
+    else if(is_pos(j))
+        set_i(i, j);  // positive input, already capped at cutoff
+    else
+        set_i(i, cutoff);  // zero input
+}
+
+int __attribute__((kernel)) simple_kernel(int in)  // Returns in * (int)(in * 0.5f).
+{
+    int i = in;
+    float f = (float) i;
+    modify_f(&f);
+    modify_i(&i);  // result in i does not feed into the return value
+    int ret = (int) f;
+    return in * ret;
+}
+
+int glob = 123;  // Script global modified by addToGlobal().
+
+void addToGlobal(int arg)  // Adds arg to glob; the host driver calls it via invoke_addToGlobal().
+{
+    glob += arg;
+}

diff --git a/tests/lldb/cpp/InfiniteLoop/Android.mk b/tests/lldb/cpp/InfiniteLoop/Android.mk
new file mode 100644
index 0000000..86a4ec5
--- /dev/null
+++ b/tests/lldb/cpp/InfiniteLoop/Android.mk

@@ -0,0 +1,13 @@
+# Builds the CppInfiniteLoop executable from the host driver and its script.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := CppInfiniteLoop
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+
+LOCAL_SRC_FILES := \
+	InfiniteLoop.cpp \
+	infiniteloop.rscript
+
+# Settings shared by all C++ lldb tests.
+include frameworks/rs/tests/lldb/cpp/common.mk
+include $(BUILD_EXECUTABLE)

diff --git a/tests/lldb/cpp/InfiniteLoop/InfiniteLoop.cpp b/tests/lldb/cpp/InfiniteLoop/InfiniteLoop.cpp
new file mode 100644
index 0000000..2b55d4e
--- /dev/null
+++ b/tests/lldb/cpp/InfiniteLoop/InfiniteLoop.cpp

@@ -0,0 +1,57 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <thread>
+#include <chrono>
+
+#include <RenderScript.h>
+
+#include "ScriptC_infiniteloop.h"
+
+// Test driver: launches simple_kernel over a 64x64 RGBA image every two
+// seconds, without end. Unlike the sibling tests this one does not pass
+// RS_INIT_WAIT_FOR_ATTACH.
+int main()
+{
+    static const int kSide = 64;
+    sp<RS> context = new RS();
+
+    context->init("/data/rscache", RS_INIT_LOW_LATENCY);
+
+    auto pixelElement = Element::RGBA_8888(context);
+    Type::Builder imageBuilder(context, pixelElement);
+    imageBuilder.setX(kSide);
+    imageBuilder.setY(kSide);
+    auto imageType = imageBuilder.create();
+
+    auto source = Allocation::createTyped(context, imageType);
+    auto sink = Allocation::createTyped(context, imageType);
+
+    sp<ScriptC_infiniteloop> script = new ScriptC_infiniteloop(context);
+
+    // Test is designed to loop forever, waits for two seconds
+    // between each invocation of the kernel
+    bool forever = true;
+    while(forever)
+    {
+        script->forEach_simple_kernel(source, sink);
+        std::this_thread::sleep_for(std::chrono::seconds(2));
+    }
+
+    // Only reached if `forever` is changed from outside this code.
+    uint32_t * readback = new uint32_t[kSide*kSide];
+    sink->copy2DRangeTo(0, 0, kSide, kSide, readback);
+    delete [] readback;
+
+    return 0;
+}
+

diff --git a/tests/lldb/cpp/InfiniteLoop/infiniteloop.rscript b/tests/lldb/cpp/InfiniteLoop/infiniteloop.rscript
new file mode 100644
index 0000000..7eff95c
--- /dev/null
+++ b/tests/lldb/cpp/InfiniteLoop/infiniteloop.rscript

@@ -0,0 +1,55 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.cppinfiniteloop)
+
+
+float4 gColor = {0.299f, 0.587f, 0.114f, 1.f};  // Colour written to every output element.
+
+/* RenderScript kernel that unpacks the input pixel, overwrites all four
+ * channels with gColor and repacks the result; the extra steps exist so the
+ * kernel is not completely empty (and can therefore be debugged).
+ */
+uchar4 __attribute__((kernel)) simple_kernel(uchar4 in)
+{
+    float4 out = rsUnpackColor8888(in);
+
+    out.r = gColor.r;
+    out.g = gColor.g;
+    out.b = gColor.b;
+    out.a = gColor.a;
+
+    uchar4 result = rsPackColorTo8888(out);
+    return result;
+}
+

diff --git a/tests/lldb/cpp/KernelVariables/Android.mk b/tests/lldb/cpp/KernelVariables/Android.mk
new file mode 100644
index 0000000..7a68c93
--- /dev/null
+++ b/tests/lldb/cpp/KernelVariables/Android.mk

@@ -0,0 +1,13 @@
+# Builds the CppKernelVariables executable from the host driver and its script.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := CppKernelVariables
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+
+LOCAL_SRC_FILES := \
+	KernelVariables.cpp \
+	simple.rscript
+
+# Settings shared by all C++ lldb tests.
+include frameworks/rs/tests/lldb/cpp/common.mk
+include $(BUILD_EXECUTABLE)

diff --git a/tests/lldb/cpp/KernelVariables/KernelVariables.cpp b/tests/lldb/cpp/KernelVariables/KernelVariables.cpp
new file mode 100644
index 0000000..e289005
--- /dev/null
+++ b/tests/lldb/cpp/KernelVariables/KernelVariables.cpp

@@ -0,0 +1,87 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <RenderScript.h>
+
+#include "ScriptC_simple.h"
+
+// Test driver: binds 1D, 2D, 3D and YUV allocations plus a sampler to the
+// script's globals, runs the kernel once over a 64x64 RGBA image and reads
+// the output back into a scratch buffer.
+int main()
+{
+    static const int size = 64;
+    sp<RS> rs = new RS();
+
+    rs->init("/data/rscache", RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);
+
+    // size x size RGBA_8888 type used for the kernel's input (a) and output (b).
+    auto e = Element::RGBA_8888(rs);
+    Type::Builder tb(rs, e);
+    tb.setX(size);
+    tb.setY(size);
+    auto t = tb.create();
+
+    auto a = Allocation::createTyped(rs, t);
+    auto b = Allocation::createTyped(rs, t);
+
+    sp<ScriptC_simple> s = new ScriptC_simple(rs);
+
+    // 1D allocation of four ints.
+    static const int buffer_int[] = {1, 2, 3, 4};
+    sp<Allocation> int_allocation = Allocation::createSized(rs, Element::I32(rs), 4);
+    int_allocation->copy1DRangeFrom(0, 4, buffer_int);
+    s->set_allocation_1D_global(int_allocation);
+
+    static const int buffer_int2[] = {5, 6, 7, 8};
+
+    // 2x2 int allocation.
+    // NOTE(review): this 2D allocation is bound to the script global named
+    // allocation_1D_global2 - confirm the naming mismatch is intentional.
+    Type::Builder typeI32Builder2D(rs, Element::I32(rs));
+    typeI32Builder2D.setX(2);
+    typeI32Builder2D.setY(2);
+
+    sp<Allocation> int_allocation2 = Allocation::createTyped(rs, typeI32Builder2D.create());
+    int_allocation2->copy2DRangeFrom(0, 0, 2, 2, buffer_int2);
+    s->set_allocation_1D_global2(int_allocation2);
+
+    // The kernel's own input/output images double as the 2D globals.
+    s->set_allocation_2D_global(a);
+    s->set_allocation_2D_global2(b);
+
+    static const int buffer_int3[] = {9, 10, 11, 12, 13, 14, 15, 16};
+
+    // 2x2x2 int allocation.
+    Type::Builder typeI32Builder3D(rs, Element::I32(rs));
+    typeI32Builder3D.setX(2);
+    typeI32Builder3D.setY(2);
+    typeI32Builder3D.setZ(2);
+
+    sp<Allocation> int_allocation3 = Allocation::createTyped(rs, typeI32Builder3D.create());
+    int_allocation3->copy3DRangeFrom(0, 0, 0, 2, 2, 2, buffer_int3);
+    s->set_allocation_3D_global(int_allocation3);
+
+    // 4x4 YV12 allocation; no data is copied into it here.
+    Type::Builder yuvTypeBuilder(rs, Element::YUV(rs));
+    yuvTypeBuilder.setX(4);
+    yuvTypeBuilder.setY(4);
+    yuvTypeBuilder.setYuvFormat(RS_YUV_YV12);
+
+    sp<Allocation> yuv_allocation = Allocation::createTyped(rs, yuvTypeBuilder.create());
+    s->set_allocation_YUV_2D_global(yuv_allocation);
+
+    s->set_sampler_global(Sampler::CLAMP_LINEAR(rs));
+
+    // Script is executed once, then the data is copied back when finished
+    s->forEach_kernel(a, b);
+    rs->finish();
+    uint32_t * output = new uint32_t[size*size];
+    b->copy2DRangeTo(0, 0, size, size, output);
+    delete [] output;
+
+    return 0;
+}
+

diff --git a/tests/lldb/cpp/KernelVariables/simple.rscript b/tests/lldb/cpp/KernelVariables/simple.rscript
new file mode 100644
index 0000000..bad675e
--- /dev/null
+++ b/tests/lldb/cpp/KernelVariables/simple.rscript

@@ -0,0 +1,197 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.cppkernelvariables)
+
+char char_global = 12;  // Scalar globals: one per basic numeric type.
+uchar uchar_global = 234;
+short short_global = -321;
+ushort ushort_global = 432;
+int int_global = 1234;
+uint uint_global = 2345;
+float float_global = 4.5f;
+long long_global = -77777;
+ulong ulong_global = 8888;
+double double_global = -456.5f;
+
+char2 char2_global = {11, -22};  // Two-component vector globals.
+uchar2 uchar2_global = {33, 44};
+short2 short2_global = {-555, 666};
+ushort2 ushort2_global = {777, 888};
+int2 int2_global = {999, -1111};
+uint2 uint2_global = {2222, 3333};
+float2 float2_global = {4.5f, -5.0f};
+long2 long2_global = {-4444, 5555};
+ulong2 ulong2_global = {6666, 7777};
+double2 double2_global = {88.5f, -99.0f};
+
+char3 char3_global = {11, -22, -33};  // Three-component vector globals.
+uchar3 uchar3_global = {33, 44, 55};
+short3 short3_global = {-555, 666, 777};
+ushort3 ushort3_global = {777, 888, 999};
+int3 int3_global = {999, -1111, 2222};
+uint3 uint3_global = {2222, 3333, 4444};
+float3 float3_global = {4.5f, -5.0f, -6.5f};
+long3 long3_global = {-4444, 5555, 6666};
+ulong3 ulong3_global = {6666, 7777, 8888};
+double3 double3_global = {88.5f, -99.0f, 111.5f};
+
+char4 char4_global = {55, 11, -22, -33};  // Four-component vector globals.
+uchar4 uchar4_global = {222, 33, 44, 55};
+short4 short4_global = {-444, -555, 666, 777};
+ushort4 ushort4_global = {666, 777, 888, 999};
+int4 int4_global = {888, 999, -1111, 2222};
+uint4 uint4_global = {1111, 2222, 3333, 4444};
+float4 float4_global = {3.0f, 4.5f, -5.0f, -6.5f};
+long4 long4_global = {-3333, -4444, 5555, 6666};
+ulong4 ulong4_global = {5555, 6666, 7777, 8888};
+double4 double4_global = {-77.0f, 88.5f, -99.0f, 111.5f};
+
+rs_matrix2x2 matrix2x2_global;  // Matrix globals; assigned inside kernel().
+rs_matrix3x3 matrix3x3_global;
+rs_matrix4x4 matrix4x4_global;
+
+rs_quaternion quaternion_global;  // Set inside kernel() via rsQuaternionSet.
+
+rs_allocation allocation_1D_global;  // Allocation handles; no initialiser in the script.
+rs_allocation allocation_1D_global2;
+rs_allocation allocation_2D_global;
+rs_allocation allocation_2D_global2;
+rs_allocation allocation_3D_global;
+rs_allocation allocation_YUV_2D_global;
+
+rs_allocation_cubemap_face cubemap_face_global;  // No initialiser in the script.
+rs_sampler sampler_global;
+
+uchar4 __attribute__((kernel)) kernel(uchar4 in)  // Declares a local of each basic type, folds them into globals; always returns {1,2,3,4}.
+{
+    char char_local = 'a';  // Scalar locals.
+    uchar uchar_local = 'b';
+    short short_local = -321;
+    ushort ushort_local = 432;
+    int int_local = 1234;
+    uint uint_local = 2345;
+    float float_local = 4.5f;
+    long long_local = -77777;
+    ulong ulong_local = 8888;
+    double double_local = -456.5f;
+
+    char2 char2_local = {-11, -22};  // Two-component vector locals.
+    uchar2 uchar2_local = {33, 44};
+    short2 short2_local = {-555, 666};
+    ushort2 ushort2_local = {777, 888};
+    int2 int2_local = {999, -1111};
+    uint2 uint2_local = {2222, 3333};
+    float2 float2_local = {4.5f, -5.0f};
+    long2 long2_local = {-4444, 5555};
+    ulong2 ulong2_local = {6666, 7777};
+    double2 double2_local = {88.5f, -99.0f};
+
+    char3 char3_local = {11, -22, -33};  // Three-component vector locals.
+    uchar3 uchar3_local = {33, 44, 55};
+    short3 short3_local = {-555, 666, 777};
+    ushort3 ushort3_local = {777, 888, 999};
+    int3 int3_local = {999, -1111, 2222};
+    uint3 uint3_local = {2222, 3333, 4444};
+    float3 float3_local = {4.5f, -5.0f, -6.5f};
+    long3 long3_local = {-4444, 5555, 6666};
+    ulong3 ulong3_local = {6666, 7777, 8888};
+    double3 double3_local = {88.5f, -99.0f, 111.5f};
+
+    char4 char4_local = {55, 11, -22, -33};  // Four-component vector locals.
+    uchar4 uchar4_local = {22, 33, 44, 55};
+    short4 short4_local = {-444, -555, 666, 777};
+    ushort4 ushort4_local = {666, 777, 888, 999};
+    int4 int4_local = {888, 999, -1111, 2222};
+    uint4 uint4_local = {1111, 2222, 3333, 4444};
+    float4 float4_local = {3.0f, 4.5f, -5.0f, -6.5f};
+    long4 long4_local = {-3333, -4444, 5555, 6666};
+    ulong4 ulong4_local = {5555, 6666, 7777, 8888};
+    double4 double4_local = {-77.0f, 88.5f, -99.0f, 111.5f};
+
+    rs_matrix2x2 matrix2x2_local = {{1., 2.5,
+                                     3., 4.5}};
+    rs_matrix3x3 matrix3x3_local = {{5., 6.5, 7.,
+                                     8.5, 9., 1.5,
+                                     2., 3.5, 4.}};
+    rs_matrix4x4 matrix4x4_local = {{5.5, 6., 7.5, 8.,
+                                     9., 1.5, 2., 3.5,
+                                     4.5, 5.5, 6.5, 7.,
+                                     8., 9.5, 1.5, 2.5}};
+
+    matrix2x2_global = matrix2x2_local;  // Publish the local matrices through the globals.
+    matrix3x3_global = matrix3x3_local;
+    matrix4x4_global = matrix4x4_local;
+
+    rsQuaternionSet(&quaternion_global, 3.0, 4.5, 5.5, 6.0);
+
+    rs_quaternion quaternion_local;
+    rsQuaternionSet(&quaternion_local, 7.5, 8.0, 9.0, 0.5);
+
+    char char_combined = char_local + (char)uchar_local + char2_local.x +  // note: the uchar3 term below is subtracted
+        (char)uchar2_local.x + char3_local.x - (char)uchar3_local.x +
+        char4_local.x + (char)uchar4_local.x;
+
+    short short_combined = short_local + (short)ushort_local + short2_local.x +
+        (short)ushort2_local.x + short3_local.x + (short)ushort3_local.x +
+        short4_local.x + (short)ushort4_local.x;
+
+    int int_combined = int_local + (int)uint_local + int2_local.x +
+        (int)uint2_local.x + int3_local.x + (int)uint3_local.x + int4_local.x +
+        (int)uint4_local.x;
+
+    float float_combined = float_local + float2_local.x + float3_local.x +
+        float4_local.x;
+
+    long long_combined = long_local + (long)ulong_local + long2_local.x +
+        (long)ulong2_local.x + long3_local.x + (long)ulong3_local.x +
+        long4_local.x + (long)ulong4_local.x;
+
+    double double_combined = double_local + double2_local.x + double3_local.x +
+        double4_local.x;
+
+    char_global = char_combined;  // Write each combined sum back to the matching scalar global.
+    short_global = short_combined;
+    int_global = int_combined;
+    float_global = float_combined;
+    long_global = long_combined;
+    double_global = double_combined;
+
+    uchar4 result = {1,2,3,4};  // Output does not depend on `in`.
+    return result;
+}
+
+float use_constants_global;  // Receives the sum computed in setup().
+
+void setup(void)  // Sums the M_* math constants into use_constants_global so each one is referenced.
+{
+  use_constants_global =
+      M_1_PI +
+      M_2_PI +
+      M_2_PIl +
+      M_2_SQRTPI +
+      M_E +
+      M_LN10 +
+      M_LN2 +
+      M_LOG10E +
+      M_LOG2E +
+      M_PI +
+      M_PI_2 +
+      M_PI_4 +
+      M_SQRT1_2 +
+      M_SQRT2;
+}

diff --git a/tests/lldb/cpp/MultipleRSFiles/Android.mk b/tests/lldb/cpp/MultipleRSFiles/Android.mk
new file mode 100644
index 0000000..d4b5b35
--- /dev/null
+++ b/tests/lldb/cpp/MultipleRSFiles/Android.mk

@@ -0,0 +1,14 @@
+# Builds the CppMultipleRSFiles executable from the host driver and two scripts.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := CppMultipleRSFiles
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+
+LOCAL_SRC_FILES := \
+	MultipleRSFiles.cpp \
+	first.rscript \
+	second.rscript
+
+# Settings shared by all C++ lldb tests.
+include frameworks/rs/tests/lldb/cpp/common.mk
+include $(BUILD_EXECUTABLE)

diff --git a/tests/lldb/cpp/MultipleRSFiles/MultipleRSFiles.cpp b/tests/lldb/cpp/MultipleRSFiles/MultipleRSFiles.cpp
new file mode 100644
index 0000000..9d0d4e7
--- /dev/null
+++ b/tests/lldb/cpp/MultipleRSFiles/MultipleRSFiles.cpp

@@ -0,0 +1,51 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <RenderScript.h>
+
+#include "ScriptC_first.h"
+#include "ScriptC_second.h"
+
+// Test driver: runs a kernel from each of two separate scripts over the same
+// pair of 64x64 RGBA allocations, reading the output back after the first.
+int main()
+{
+    static const int kSide = 64;
+    sp<RS> context = new RS();
+
+    context->init("/data/rscache", RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);
+
+    auto pixelElement = Element::RGBA_8888(context);
+    Type::Builder imageBuilder(context, pixelElement);
+    imageBuilder.setX(kSide);
+    imageBuilder.setY(kSide);
+    auto imageType = imageBuilder.create();
+
+    auto source = Allocation::createTyped(context, imageType);
+    auto sink = Allocation::createTyped(context, imageType);
+
+    // Script is executed once, then the data is copied back when finished
+    sp<ScriptC_first> firstScript = new ScriptC_first(context);
+    sp<ScriptC_second> secondScript = new ScriptC_second(context);
+
+    firstScript->forEach_first_kernel(source, sink);
+    uint32_t * readback = new uint32_t[kSide*kSide];
+    sink->copy2DRangeTo(0, 0, kSide, kSide, readback);
+    delete [] readback;
+
+    // The second kernel's output is not read back; only finish() follows.
+    secondScript->forEach_second_kernel(source, sink);
+
+    context->finish();
+    return 0;
+}

diff --git a/tests/lldb/cpp/MultipleRSFiles/first.rscript b/tests/lldb/cpp/MultipleRSFiles/first.rscript
new file mode 100644
index 0000000..2f84c46
--- /dev/null
+++ b/tests/lldb/cpp/MultipleRSFiles/first.rscript

@@ -0,0 +1,32 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.cppmultiplersfiles)
+
+float4 gColor = {0.299f, 0.587f, 0.114f, 1.f};  // Colour written to every output element.
+
+/* RenderScript kernel that unpacks the input pixel, replaces it wholesale
+ * with gColor and repacks the result; the extra steps exist so the kernel
+ * is not completely empty (and can therefore be debugged).
+ */
+uchar4 __attribute__((kernel)) first_kernel(uchar4 in)
+{
+    float4 temp = rsUnpackColor8888(in);
+    temp = gColor;
+    uchar4 result = rsPackColorTo8888(temp);
+    return result;
+}

diff --git a/tests/lldb/cpp/MultipleRSFiles/second.rscript b/tests/lldb/cpp/MultipleRSFiles/second.rscript
new file mode 100644
index 0000000..0279d0f
--- /dev/null
+++ b/tests/lldb/cpp/MultipleRSFiles/second.rscript

@@ -0,0 +1,25 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.cppmultiplersfiles)
+
+/* RenderScript kernel that returns the input with its components reversed (wzyx). */
+uchar4 __attribute__((kernel)) second_kernel(uchar4 in)
+{
+    uchar4 result = in.wzyx;
+    return result;
+}

diff --git a/tests/lldb/cpp/WaitAttach/Android.mk b/tests/lldb/cpp/WaitAttach/Android.mk
new file mode 100644
index 0000000..00bf745
--- /dev/null
+++ b/tests/lldb/cpp/WaitAttach/Android.mk

@@ -0,0 +1,27 @@
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+# Variant 1: WaitAttach.cpp and simple.rscript built with the unmodified flags from common.mk.
+LOCAL_MODULE := CppDebugWaitAttach
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+
+LOCAL_SRC_FILES := \
+	WaitAttach.cpp \
+	simple.rscript
+
+include frameworks/rs/tests/lldb/cpp/common.mk
+include $(BUILD_EXECUTABLE)
+
+include $(CLEAR_VARS)
+# Variant 2: same sources, but -g is filtered out of the RenderScript flags after common.mk has set them.
+LOCAL_MODULE := CppNoDebugWaitAttach
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+
+LOCAL_SRC_FILES := \
+	WaitAttach.cpp \
+	simple.rscript
+
+include frameworks/rs/tests/lldb/cpp/common.mk
+LOCAL_RENDERSCRIPT_FLAGS := $(filter-out -g,$(LOCAL_RENDERSCRIPT_FLAGS))
+include $(BUILD_EXECUTABLE)

diff --git a/tests/lldb/cpp/WaitAttach/WaitAttach.cpp b/tests/lldb/cpp/WaitAttach/WaitAttach.cpp
new file mode 100644
index 0000000..c6c1980
--- /dev/null
+++ b/tests/lldb/cpp/WaitAttach/WaitAttach.cpp

@@ -0,0 +1,48 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <RenderScript.h>
+
+#include "ScriptC_simple.h"
+
+int main() // Host driver: creates two 8x8 RGBA_8888 allocations and launches both kernels of simple.rscript over them.
+{
+    static const int size = 8; // Width and height of the 2D allocations, in elements.
+    sp<RS> rs = new RS();
+
+    rs->init("/data/rscache", RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH); // NOTE(review): return value is not checked; WAIT_FOR_ATTACH presumably stalls until a debugger attaches -- confirm.
+
+    auto e = Element::RGBA_8888(rs);
+    Type::Builder tb(rs, e);
+    tb.setX(size);
+    tb.setY(size);
+    auto t = tb.create();
+
+    auto a = Allocation::createTyped(rs, t); // kernel input (never filled by the host)
+    auto b = Allocation::createTyped(rs, t); // kernel output
+
+    // Launch simple_kernel over a -> b, then copy b's contents into a temporary host buffer
+    sp<ScriptC_simple> s = new ScriptC_simple(rs);
+    s->forEach_simple_kernel(a, b);
+    uint32_t * output = new uint32_t[size*size]; // one 32-bit word per RGBA_8888 element
+    b->copy2DRangeTo(0, 0, size, size, output);
+    delete [] output; // the copied data is not inspected; the buffer is released immediately
+
+    s->forEach_other_kernel(a, b); // second launch, so both kernels in simple.rscript get executed
+
+    rs->finish();
+    return 0;
+}

diff --git a/tests/lldb/cpp/WaitAttach/simple.rscript b/tests/lldb/cpp/WaitAttach/simple.rscript
new file mode 100644
index 0000000..1e9780a
--- /dev/null
+++ b/tests/lldb/cpp/WaitAttach/simple.rscript

@@ -0,0 +1,39 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.cppwaitattach)
+
+float4 gColor = {0.299f, 0.587f, 0.114f, 1.f}; // Constant RGBA value that simple_kernel packs into every output element.
+
+/* RenderScript kernel that discards the unpacked input and returns gColor
+ * packed as 8888 for every element. The unpack/assign/pack statements are
+ * there so the body is not completely empty (and can therefore be debugged).
+ */
+uchar4 __attribute__((kernel)) simple_kernel(uchar4 in)
+{
+    float4 temp = rsUnpackColor8888(in);
+    temp = gColor; // Overwrites the value unpacked from 'in' on the previous line.
+    uchar4 result = rsPackColorTo8888(temp);
+    return result;
+}
+
+// Extra kernel (reverses the component order) to test lldb setting breakpoints on all the RS kernels.
+uchar4 __attribute__((kernel)) other_kernel(uchar4 in)
+{
+    uchar4 result = in.wzyx; // result = (in.w, in.z, in.y, in.x)
+    return result;
+}

diff --git a/tests/lldb/cpp/common.mk b/tests/lldb/cpp/common.mk
new file mode 100644
index 0000000..3b65741
--- /dev/null
+++ b/tests/lldb/cpp/common.mk

@@ -0,0 +1,11 @@
+LOCAL_MODULE_TAGS := tests
+# Shared settings for the C++ lldb test executables; each test's Android.mk includes this before BUILD_EXECUTABLE.
+LOCAL_RENDERSCRIPT_FLAGS += -g -O0 -target-api 0
+LOCAL_CFLAGS := -Werror -Wall -Wextra -std=c++11
+LOCAL_LDFLAGS += -llog
+
+LOCAL_STATIC_LIBRARIES += libRScpp_static
+# Assigned with := (not +=): this file is included once per module, and appending would accumulate stale paths.
+intermediates := $(call intermediates-dir-for,STATIC_LIBRARIES,libRS,TARGET,)
+
+LOCAL_C_INCLUDES += $(intermediates)

diff --git a/tests/lldb/java/Allocations/Android.mk b/tests/lldb/java/Allocations/Android.mk
new file mode 100644
index 0000000..cff322f
--- /dev/null
+++ b/tests/lldb/java/Allocations/Android.mk

@@ -0,0 +1,17 @@
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+# Test package "Allocations": every Java and RenderScript source found under src/.
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := \
+	$(call all-java-files-under, src) \
+	$(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := Allocations
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_PRIVATE_PLATFORM_APIS := true
+# Scripts get debug info and no optimisation; -Wno-unused is needed e.g. for struct_kernel's unread 'in' parameter.
+LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -Wno-unused -target-api 0
+
+include $(BUILD_PACKAGE)

diff --git a/toolkit/test/AndroidManifest.xml b/tests/lldb/java/Allocations/AndroidManifest.xml
similarity index 61%
copy from toolkit/test/AndroidManifest.xml
copy to tests/lldb/java/Allocations/AndroidManifest.xml
index f709790..8650cb0 100644
--- a/toolkit/test/AndroidManifest.xml
+++ b/tests/lldb/java/Allocations/AndroidManifest.xml

@@ -1,17 +1,15 @@
 <?xml version="1.0" encoding="utf-8"?>
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
-        package="com.example.testapp">
-
-    <application
-            android:allowBackup="true"
-            android:label="Toolkit Test"
-            android:supportsRtl="true">
-        <activity android:name=".MainActivity">
+    package="com.android.rs.allocations">
+    <uses-sdk android:minSdkVersion="21" />
+    <application android:label="Allocations"
+                 android:hardwareAccelerated="true">
+        <activity android:name="MainActivity">
             <intent-filter>
                 <action android:name="android.intent.action.MAIN" />
-
                 <category android:name="android.intent.category.LAUNCHER" />
             </intent-filter>
         </activity>
     </application>
 </manifest>
+

diff --git a/tests/lldb/java/Allocations/res/layout/main_layout.xml b/tests/lldb/java/Allocations/res/layout/main_layout.xml
new file mode 100644
index 0000000..131c3b5
--- /dev/null
+++ b/tests/lldb/java/Allocations/res/layout/main_layout.xml

@@ -0,0 +1,15 @@
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#0099cc"
+    tools:context=".MainActivity">
+
+    <ImageView
+        android:id="@+id/imageView"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:scaleType="fitCenter" />
+
+</FrameLayout>
+

diff --git a/tests/lldb/java/Allocations/src/com/android/rs/allocations/MainActivity.java b/tests/lldb/java/Allocations/src/com/android/rs/allocations/MainActivity.java
new file mode 100644
index 0000000..f06f2b9
--- /dev/null
+++ b/tests/lldb/java/Allocations/src/com/android/rs/allocations/MainActivity.java

@@ -0,0 +1,552 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.android.rs.allocations;
+
+import android.app.Activity;
+import android.os.Bundle;
+import android.graphics.Bitmap;
+import android.renderscript.*;
+
+public class MainActivity extends Activity {
+    private RenderScript mRS;
+
+    private Allocation mInAllocation;      // script input
+    private Allocation mOutAllocation;     // script output
+
+    private Allocation mStructInAlloc;     // complexStruct input
+    private Allocation mStructOutAlloc;    // complexStruct output
+
+    private Allocation mBoolAllocation;    // boolean
+
+    private Allocation mCharAllocation;    // char
+    private Allocation mChar2Allocation;   // char2
+    private Allocation mChar3Allocation;   // char3
+    private Allocation mChar4Allocation;   // char4
+
+    private Allocation mUCharAllocation;   // uchar
+    private Allocation mUChar2Allocation;  // uchar2
+    private Allocation mUChar3Allocation;  // uchar3
+    private Allocation mUChar4Allocation;  // uchar4
+
+    private Allocation mShortAllocation;   // short
+    private Allocation mShort2Allocation;  // short2
+    private Allocation mShort3Allocation;  // short3
+    private Allocation mShort4Allocation;  // short4
+
+    private Allocation mUShortAllocation;  // ushort
+    private Allocation mUShort2Allocation; // ushort2
+    private Allocation mUShort3Allocation; // ushort3
+    private Allocation mUShort4Allocation; // ushort4
+
+    private Allocation mIntAllocation;     // int
+    private Allocation mInt2Allocation;    // int2
+    private Allocation mInt3Allocation;    // int3
+    private Allocation mInt4Allocation;    // int4
+
+    private Allocation mUIntAllocation;    // uint
+    private Allocation mUInt2Allocation;   // uint2
+    private Allocation mUInt3Allocation;   // uint3
+    private Allocation mUInt4Allocation;   // uint4
+
+    private Allocation mLongAllocation;    // long
+    private Allocation mLong2Allocation;   // long2
+    private Allocation mLong3Allocation;   // long3
+    private Allocation mLong4Allocation;   // long4
+
+    private Allocation mULongAllocation;   // ulong
+    private Allocation mULong2Allocation;  // ulong2
+    private Allocation mULong3Allocation;  // ulong3
+    private Allocation mULong4Allocation;  // ulong4
+
+    private Allocation mHalfAllocation;    // half
+    private Allocation mHalf2Allocation;   // half2
+    private Allocation mHalf3Allocation;   // half3
+    private Allocation mHalf4Allocation;   // half4
+
+    private Allocation mFloatAllocation;   // float
+    private Allocation mFloat2Allocation;  // float2
+    private Allocation mFloat3Allocation;  // float3
+    private Allocation mFloat4Allocation;  // float4
+
+    private Allocation mDoubleAllocation;  // double
+    private Allocation mDouble2Allocation; // double2
+    private Allocation mDouble3Allocation; // double3
+    private Allocation mDouble4Allocation; // double4
+
+    private ScriptC_allocs mScript;
+
+    private int mAllocSize = 24; // Cell count of each scalar allocation; 24 divides cleanly into the halves, quarters and X/Y/Z shapes used below
+
+    private Bitmap mBitmapIn;
+    private Bitmap mBitmapOut;
+
+    @Override
+    protected void onCreate(Bundle savedInstanceState) {
+        super.onCreate(savedInstanceState);
+
+        setContentView(R.layout.main_layout);
+
+        mBitmapIn = Bitmap.createBitmap(64, 64, Bitmap.Config.ARGB_8888);
+        mBitmapOut = Bitmap.createBitmap(mBitmapIn.getWidth(), mBitmapIn.getHeight(), mBitmapIn.getConfig());
+
+        createScript();
+        runScript();
+    }
+
+    private void createScript() {
+        mRS = RenderScript.create(this,
+            RenderScript.ContextType.NORMAL,
+            RenderScript.CREATE_FLAG_LOW_LATENCY |
+            RenderScript.CREATE_FLAG_WAIT_FOR_ATTACH);
+
+        mScript = new ScriptC_allocs(mRS);
+    }
+
+    private void createSignedAllocations() { // Creates the char/short/int/long (1-4 wide) and boolean allocations, calling mRS.finish() after each creation.
+        Type.Builder typeI8Builder = new Type.Builder(mRS, Element.I8(mRS));
+        typeI8Builder.setX(1); // One element here to test 16 byte memory alignment
+        typeI8Builder.setY(3);
+        typeI8Builder.setZ(8); // 1 x 3 x 8 = 24 cells in total
+
+        mCharAllocation = Allocation.createTyped(mRS, typeI8Builder.create());
+        mRS.finish();
+        mChar2Allocation = Allocation.createSized(mRS, Element.I8_2(mRS), mAllocSize / 2);
+        mRS.finish();
+        mChar3Allocation = Allocation.createSized(mRS, Element.I8_3(mRS), mAllocSize / 4); // 3-vectors get the same cell count as 4-vectors (presumably vec3 is stored padded to vec4 -- confirm)
+        mRS.finish();
+        mChar4Allocation = Allocation.createSized(mRS, Element.I8_4(mRS), mAllocSize / 4);
+        mRS.finish();
+
+        Type.Builder typeI16_2Builder = new Type.Builder(mRS, Element.I16_2(mRS));
+        typeI16_2Builder.setX(6);
+        typeI16_2Builder.setY(1);
+        typeI16_2Builder.setZ(2); // 6 x 1 x 2 = 12 short2 cells
+
+        mShortAllocation = Allocation.createSized(mRS, Element.I16(mRS), mAllocSize);
+        mRS.finish();
+        mShort2Allocation = Allocation.createTyped(mRS, typeI16_2Builder.create());
+        mRS.finish();
+        mShort3Allocation = Allocation.createSized(mRS, Element.I16_3(mRS), mAllocSize / 4);
+        mRS.finish();
+        mShort4Allocation = Allocation.createSized(mRS, Element.I16_4(mRS), mAllocSize / 4);
+        mRS.finish();
+
+        Type.Builder typeI32_3Builder = new Type.Builder(mRS, Element.I32_3(mRS));
+        typeI32_3Builder.setX(3);
+        typeI32_3Builder.setY(2); // 3 x 2 = 6 int3 cells
+
+        mIntAllocation = Allocation.createSized(mRS, Element.I32(mRS), mAllocSize);
+        mRS.finish();
+        mInt2Allocation = Allocation.createSized(mRS, Element.I32_2(mRS), mAllocSize / 2);
+        mRS.finish();
+        mInt3Allocation = Allocation.createTyped(mRS, typeI32_3Builder.create());
+        mRS.finish();
+        mInt4Allocation = Allocation.createSized(mRS, Element.I32_4(mRS), mAllocSize / 4);
+        mRS.finish();
+
+        Type.Builder typeI64_4Builder = new Type.Builder(mRS, Element.I64_4(mRS));
+        typeI64_4Builder.setX(1);
+        typeI64_4Builder.setY(6); // 1 x 6 = 6 long4 cells
+
+        mLongAllocation = Allocation.createSized(mRS, Element.I64(mRS), mAllocSize);
+        mRS.finish();
+        mLong2Allocation = Allocation.createSized(mRS, Element.I64_2(mRS), mAllocSize / 2);
+        mRS.finish();
+        mLong3Allocation = Allocation.createSized(mRS, Element.I64_3(mRS), mAllocSize / 4);
+        mRS.finish();
+        mLong4Allocation = Allocation.createTyped(mRS, typeI64_4Builder.create());
+        mRS.finish();
+
+        mBoolAllocation = Allocation.createSized(mRS, Element.BOOLEAN(mRS), mAllocSize);
+        mRS.finish();
+    }
+
+    private void initSignedAllocations() {
+        // Host-side staging arrays, one per element width; booleans are staged as bytes.
+        byte[] bytes = new byte[mAllocSize];
+        short[] shorts = new short[mAllocSize];
+        int[] ints = new int[mAllocSize];
+        long[] longs = new long[mAllocSize];
+        byte[] flags = new byte[mAllocSize];
+        for (int idx = 0; idx < mAllocSize; ++idx) {
+            bytes[idx] = (byte) idx;
+            shorts[idx] = (short) idx;
+            ints[idx] = idx;
+            longs[idx] = idx;
+            flags[idx] = (byte) (idx & 0x01); // alternates 0, 1, 0, 1, ...
+        }
+
+        mCharAllocation.copyFrom(bytes);
+        mChar2Allocation.copyFrom(bytes);
+        mChar3Allocation.copyFrom(bytes);
+        mChar4Allocation.copyFrom(bytes);
+
+        mShortAllocation.copyFrom(shorts);
+        mShort2Allocation.copyFrom(shorts);
+        mShort3Allocation.copyFrom(shorts);
+        mShort4Allocation.copyFrom(shorts);
+
+        mIntAllocation.copyFrom(ints);
+        mInt2Allocation.copyFrom(ints);
+        mInt3Allocation.copyFrom(ints);
+        mInt4Allocation.copyFrom(ints);
+
+        mLongAllocation.copyFrom(longs);
+        mLong2Allocation.copyFrom(longs);
+        mLong3Allocation.copyFrom(longs);
+        mLong4Allocation.copyFrom(longs);
+
+        mBoolAllocation.copyFromUnchecked(flags);
+    }
+
+    private void createUnsignedAllocations() { // Creates the uchar/ushort/uint/ulong (1-4 wide) allocations, calling mRS.finish() after each creation.
+        Type.Builder typeU8_2Builder = new Type.Builder(mRS, Element.U8_2(mRS));
+        typeU8_2Builder.setX(2);
+        typeU8_2Builder.setY(6); // 2 x 6 = 12 uchar2 cells
+
+        mUCharAllocation = Allocation.createSized(mRS, Element.U8(mRS), mAllocSize);
+        mRS.finish();
+        mUChar2Allocation = Allocation.createTyped(mRS, typeU8_2Builder.create());
+        mRS.finish();
+        mUChar3Allocation = Allocation.createSized(mRS, Element.U8_3(mRS), mAllocSize / 4);
+        mRS.finish();
+        mUChar4Allocation = Allocation.createSized(mRS, Element.U8_4(mRS), mAllocSize / 4);
+        mRS.finish();
+
+        Type.Builder typeU16_3Builder = new Type.Builder(mRS, Element.U16_3(mRS));
+        typeU16_3Builder.setX(1);
+        typeU16_3Builder.setY(6); // 1 x 6 = 6 ushort3 cells
+
+        mUShortAllocation = Allocation.createSized(mRS, Element.U16(mRS), mAllocSize); // later used as the input of square_kernel
+        mRS.finish();
+        mUShort2Allocation = Allocation.createSized(mRS, Element.U16_2(mRS), mAllocSize / 2);
+        mRS.finish();
+        mUShort3Allocation = Allocation.createTyped(mRS, typeU16_3Builder.create());
+        mRS.finish();
+        mUShort4Allocation = Allocation.createSized(mRS, Element.U16_4(mRS), mAllocSize / 4);
+        mRS.finish();
+
+        Type.Builder typeU32_4Builder = new Type.Builder(mRS, Element.U32_4(mRS));
+        typeU32_4Builder.setX(1);
+        typeU32_4Builder.setY(1);
+        typeU32_4Builder.setZ(6); // 1 x 1 x 6 = 6 uint4 cells
+
+        mUIntAllocation = Allocation.createSized(mRS, Element.U32(mRS), mAllocSize); // later used as the output of square_kernel
+        mRS.finish();
+        mUInt2Allocation = Allocation.createSized(mRS, Element.U32_2(mRS), mAllocSize / 2);
+        mRS.finish();
+        mUInt3Allocation = Allocation.createSized(mRS, Element.U32_3(mRS), mAllocSize / 4);
+        mRS.finish();
+        mUInt4Allocation = Allocation.createTyped(mRS, typeU32_4Builder.create());
+        mRS.finish();
+
+        Type.Builder typeU64Builder = new Type.Builder(mRS, Element.U64(mRS));
+        typeU64Builder.setX(4);
+        typeU64Builder.setY(3);
+        typeU64Builder.setZ(2); // 4 x 3 x 2 = 24 ulong cells
+
+        mULongAllocation = Allocation.createTyped(mRS, typeU64Builder.create());
+        mRS.finish();
+        mULong2Allocation = Allocation.createSized(mRS, Element.U64_2(mRS), mAllocSize / 2);
+        mRS.finish();
+        mULong3Allocation = Allocation.createSized(mRS, Element.U64_3(mRS), mAllocSize / 4);
+        mRS.finish();
+        mULong4Allocation = Allocation.createSized(mRS, Element.U64_4(mRS), mAllocSize / 4);
+        mRS.finish();
+    }
+
+    private void initUnsignedAllocations() {
+        // Java has no unsigned primitives, so the signed arrays of matching width are used for staging.
+        byte[] bytes = new byte[mAllocSize];
+        short[] shorts = new short[mAllocSize];
+        int[] ints = new int[mAllocSize];
+        long[] longs = new long[mAllocSize];
+        for (int idx = 0; idx < mAllocSize; ++idx) {
+            bytes[idx] = (byte) idx;
+            shorts[idx] = (short) idx;
+            ints[idx] = idx;
+            longs[idx] = idx;
+        }
+
+        mUCharAllocation.copyFrom(bytes);
+        mUChar2Allocation.copyFrom(bytes);
+        mUChar3Allocation.copyFrom(bytes);
+        mUChar4Allocation.copyFrom(bytes);
+
+        mUShortAllocation.copyFrom(shorts);
+        mUShort2Allocation.copyFrom(shorts);
+        mUShort3Allocation.copyFrom(shorts);
+        mUShort4Allocation.copyFrom(shorts);
+
+        mUIntAllocation.copyFrom(ints);
+        mUInt2Allocation.copyFrom(ints);
+        mUInt3Allocation.copyFrom(ints);
+        mUInt4Allocation.copyFrom(ints);
+
+        mULongAllocation.copyFrom(longs);
+        mULong2Allocation.copyFrom(longs);
+        mULong3Allocation.copyFrom(longs);
+        mULong4Allocation.copyFrom(longs);
+    }
+
+    private void createFloatAllocations() { // Creates the half/float/double (1-4 wide) allocations, calling mRS.finish() after each creation.
+        Type.Builder typeF16_3Builder = new Type.Builder(mRS, Element.F16_3(mRS));
+        typeF16_3Builder.setX(1);
+        typeF16_3Builder.setY(6); // 1 x 6 = 6 half3 cells
+
+        mHalfAllocation = Allocation.createSized(mRS, Element.F16(mRS), mAllocSize);
+        mRS.finish();
+        mHalf2Allocation = Allocation.createSized(mRS, Element.F16_2(mRS), mAllocSize / 2);
+        mRS.finish();
+        mHalf3Allocation = Allocation.createTyped(mRS, typeF16_3Builder.create());
+        mRS.finish();
+        mHalf4Allocation = Allocation.createSized(mRS, Element.F16_4(mRS), mAllocSize / 4);
+        mRS.finish();
+
+        Type.Builder typeF32_4Builder = new Type.Builder(mRS, Element.F32_4(mRS));
+        typeF32_4Builder.setX(3);
+        typeF32_4Builder.setY(2); // 3 x 2 = 6 float4 cells
+
+        mFloatAllocation = Allocation.createSized(mRS, Element.F32(mRS), mAllocSize);
+        mRS.finish();
+        mFloat2Allocation = Allocation.createSized(mRS, Element.F32_2(mRS), mAllocSize / 2);
+        mRS.finish();
+        mFloat3Allocation = Allocation.createSized(mRS, Element.F32_3(mRS), mAllocSize / 4);
+        mRS.finish();
+        mFloat4Allocation = Allocation.createTyped(mRS, typeF32_4Builder.create());
+        mRS.finish();
+
+        Type.Builder typeF64_2Builder = new Type.Builder(mRS, Element.F64_2(mRS));
+        typeF64_2Builder.setX(4);
+        typeF64_2Builder.setY(1);
+        typeF64_2Builder.setZ(3); // 4 x 1 x 3 = 12 double2 cells
+
+        mDoubleAllocation = Allocation.createSized(mRS, Element.F64(mRS), mAllocSize);
+        mRS.finish();
+        mDouble2Allocation = Allocation.createTyped(mRS, typeF64_2Builder.create());
+        mRS.finish();
+
+        Type.Builder typeF64_3Builder = new Type.Builder(mRS, Element.F64_3(mRS));
+        typeF64_3Builder.setX(1);
+        typeF64_3Builder.setY(2);
+        typeF64_3Builder.setZ(3); // 1 x 2 x 3 = 6 double3 cells
+
+        Type.Builder typeF64_4Builder = new Type.Builder(mRS, Element.F64_4(mRS));
+        typeF64_4Builder.setX(1);
+        typeF64_4Builder.setY(2);
+        typeF64_4Builder.setZ(3); // same 1 x 2 x 3 shape as the double3 type above
+
+        mDouble3Allocation = Allocation.createTyped(mRS, typeF64_3Builder.create()); // later used as the output of add_half_kernel
+        mRS.finish();
+        mDouble4Allocation = Allocation.createTyped(mRS, typeF64_4Builder.create()); // later used as the input of add_half_kernel
+        mRS.finish();
+    }
+
+    private void initFloatAllocations() {
+        // No java type for half precision float, so bitcast 16-bit int
+        short[] buffer_half = new short[mAllocSize];
+        float[] buffer_float = new float[mAllocSize];
+        double[] buffer_double = new double[mAllocSize];
+
+        for(int i = 0; i < mAllocSize; ++i) {
+            // Construct IEEE 754 half with increasing fraction.
+            byte mantissa = (byte)(i);
+            byte exponent = 0b00111100; // keep exponent constant at one
+            buffer_half[i] = (short)((exponent << 8) | mantissa);
+
+            buffer_float[i] = (float) 1 / i;
+            buffer_double[i] = (double) 1 / i;
+        }
+
+        mHalfAllocation.copyFromUnchecked(buffer_half);
+        mHalf2Allocation.copyFromUnchecked(buffer_half);
+        mHalf3Allocation.copyFromUnchecked(buffer_half);
+        mHalf4Allocation.copyFromUnchecked(buffer_half);
+
+        mFloatAllocation.copyFrom(buffer_float);
+        mFloat2Allocation.copyFrom(buffer_float);
+        mFloat3Allocation.copyFrom(buffer_float);
+        mFloat4Allocation.copyFrom(buffer_float);
+
+        mDoubleAllocation.copyFrom(buffer_double);
+        mDouble2Allocation.copyFrom(buffer_double);
+        mDouble3Allocation.copyFrom(buffer_double);
+        mDouble4Allocation.copyFrom(buffer_double);
+    }
+
+    private void createStructAllocations() { // Builds two complexStruct arrays, binds them to the script's in/out pointers and keeps their backing allocations.
+        ScriptField_complexStruct complex_struct;
+
+        complex_struct = new ScriptField_complexStruct(mRS, mAllocSize); // input array of mAllocSize structs
+        mRS.finish();
+        mScript.bind_g_complexStruct_in(complex_struct);
+        mRS.finish();
+        mStructInAlloc = complex_struct.getAllocation();
+        mRS.finish();
+
+        complex_struct = new ScriptField_complexStruct(mRS, mAllocSize); // output array; the local is reused, the input stays reachable via mStructInAlloc
+        mRS.finish();
+        mScript.bind_g_complexStruct_out(complex_struct);
+        mRS.finish();
+        mStructOutAlloc = complex_struct.getAllocation();
+        mRS.finish();
+    }
+
+    private void overwriteFloatAllocations() {
+        // Replace the contents of every float allocation with -1/n,
+        // where n is the index of the value within the staging array.
+        float[] negatedInverse = new float[mAllocSize];
+        for (int idx = 0; idx < mAllocSize; ++idx) {
+            negatedInverse[idx] = -1f / idx;
+        }
+
+        mFloatAllocation.copyFrom(negatedInverse);
+        mFloat2Allocation.copyFrom(negatedInverse);
+        mFloat3Allocation.copyFrom(negatedInverse);
+        mFloat4Allocation.copyFrom(negatedInverse);
+    }
+
+    private void runScript() { // Runs four phases in order (signed, unsigned, floating point, struct), each creating allocations and launching one kernel.
+        mInAllocation = Allocation.createFromBitmap(mRS, mBitmapIn);
+        mRS.finish();
+        mOutAllocation = Allocation.createFromBitmap(mRS, mBitmapOut);
+        mRS.finish();
+
+        // Phase 1: signed integer and boolean allocations, swizzle_kernel over the bitmap allocations.
+        createSignedAllocations();
+        initSignedAllocations();
+
+        mRS.finish();
+        mScript.forEach_swizzle_kernel(mInAllocation, mOutAllocation);
+        mRS.finish();
+
+        mOutAllocation.copyTo(mBitmapOut);
+
+        mCharAllocation.destroy();
+        mRS.finish();
+        mChar2Allocation.destroy();
+        mRS.finish();
+        mChar3Allocation.destroy();
+        mRS.finish();
+        mChar4Allocation.destroy();
+        mRS.finish();
+
+        mShort2Allocation.destroy(); // NOTE(review): mShortAllocation is not destroyed in this method -- confirm this is intentional
+        mRS.finish();
+        mShort3Allocation.destroy();
+        mRS.finish();
+        mShort4Allocation.destroy();
+        mRS.finish();
+
+        mIntAllocation.destroy();
+        mRS.finish();
+        mInt2Allocation.destroy();
+        mRS.finish();
+        mInt3Allocation.destroy();
+        mRS.finish();
+        mInt4Allocation.destroy();
+        mRS.finish();
+
+        mLongAllocation.destroy();
+        mRS.finish();
+        mLong2Allocation.destroy();
+        mRS.finish();
+        mLong3Allocation.destroy();
+        mRS.finish();
+        mLong4Allocation.destroy();
+        mRS.finish();
+
+        mBoolAllocation.destroy();
+        mRS.finish();
+
+        // Phase 2: unsigned integer allocations, square_kernel from the ushort into the uint allocation.
+        createUnsignedAllocations();
+        initUnsignedAllocations();
+
+        mInAllocation = mUShortAllocation; // Host side assignment
+
+        mRS.finish();
+        mScript.forEach_square_kernel(mInAllocation, mUIntAllocation);
+        mRS.finish();
+
+        mUCharAllocation.destroy();
+        mRS.finish();
+        mUChar2Allocation.destroy();
+        mRS.finish();
+        mUChar3Allocation.destroy();
+        mRS.finish();
+        mUChar4Allocation.destroy();
+        mRS.finish();
+
+        mUShortAllocation.destroy(); // mInAllocation still refers to this allocation; it is not used again below
+        mRS.finish();
+        mUShort2Allocation.destroy();
+        mRS.finish();
+        mUShort3Allocation.destroy();
+        mRS.finish();
+        mUShort4Allocation.destroy();
+        mRS.finish();
+
+        mUInt2Allocation.destroy(); // mUIntAllocation (the square_kernel output) is not destroyed here
+        mRS.finish();
+        mUInt3Allocation.destroy();
+        mRS.finish();
+        mUInt4Allocation.destroy();
+        mRS.finish();
+
+        mULongAllocation.destroy();
+        mRS.finish();
+        mULong2Allocation.destroy();
+        mRS.finish();
+        mULong3Allocation.destroy();
+        mRS.finish();
+        mULong4Allocation.destroy();
+        mRS.finish();
+
+        // Phase 3: floating point allocations, add_half_kernel from the double4 into the double3 allocation.
+        createFloatAllocations();
+        initFloatAllocations();
+
+        mRS.finish();
+        mScript.forEach_add_half_kernel(mDouble4Allocation, mDouble3Allocation);
+        mRS.finish();
+
+        mHalfAllocation.destroy();
+        mRS.finish();
+        mHalf2Allocation.destroy();
+        mRS.finish();
+        mHalf3Allocation.destroy();
+        mRS.finish();
+        mHalf4Allocation.destroy();
+        mRS.finish();
+
+        mDoubleAllocation.destroy();
+        mRS.finish();
+        mDouble2Allocation.destroy();
+        mRS.finish();
+        mDouble4Allocation.destroy(); // mDouble3Allocation (the add_half_kernel output) is not destroyed here
+        mRS.finish();
+
+        overwriteFloatAllocations(); // the float allocations are still alive at this point and get new contents
+
+        createStructAllocations(); // Phase 4: struct allocations, struct_kernel from the in array to the out array
+
+        mRS.finish();
+        mScript.forEach_struct_kernel(mStructInAlloc, mStructOutAlloc);
+        mRS.finish();
+    }
+}

diff --git a/tests/lldb/java/Allocations/src/rs/allocs.rscript b/tests/lldb/java/Allocations/src/rs/allocs.rscript
new file mode 100644
index 0000000..0ec39c5
--- /dev/null
+++ b/tests/lldb/java/Allocations/src/rs/allocs.rscript

@@ -0,0 +1,82 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.allocations)
+
+struct simpleStruct { // Pair of one signed and one unsigned int; nested inside complexStruct.
+   int i;
+   unsigned int j;
+};
+
+struct complexStruct { // Element type of struct_kernel: a nested struct, a vector and a fixed-size array.
+    struct simpleStruct s;
+    uchar4 c;
+    float f[2];
+};
+
+struct complexStruct *g_complexStruct_in;  // bound from MainActivity via bind_g_complexStruct_in()
+struct complexStruct *g_complexStruct_out; // bound from MainActivity via bind_g_complexStruct_out()
+
+// Kernel performs a basic vector swizzle: returns the input's components in reverse order (w, z, y, x)
+uchar4 __attribute__((kernel)) swizzle_kernel(uchar4 in)
+{
+    return in.wzyx;
+}
+
+// Kernel squares every ushort element of the input, returning the product as a uint
+uint __attribute__((kernel)) square_kernel(ushort in)
+{
+    uint result = (uint)(in) * (uint)in; // both operands are converted to uint before multiplying
+    return result;
+}
+
+// Helper function (called by add_half_kernel) that returns the passed in double plus 1/2
+static double half_helper(double in)
+{
+    return (in + 0.5);
+}
+
+// Kernel returns the first 3 elements of a double4, each increased by 1/2; in.w is not read
+double3 __attribute__((kernel)) add_half_kernel(double4 in)
+{
+    double3 result;
+    result.x = half_helper(in.x);
+    result.y = half_helper(in.y);
+    result.z = half_helper(in.z);
+    return result;
+}
+
+// Kernel for testing structs: never reads 'in'; every output field is derived from x or is a constant
+struct complexStruct __attribute__((kernel))
+struct_kernel(struct complexStruct in, uint32_t x)
+{
+   struct complexStruct complex_out;
+   struct simpleStruct simple_out;
+   simple_out.i = (int) x;
+   simple_out.j = x;
+   complex_out.s = simple_out;
+
+   complex_out.f[0] = (float) x;
+   complex_out.f[1] = (float) x + 0.5; // the cast applies to x only; 0.5 is added afterwards
+
+   complex_out.c.x = (uchar) (x % 128); // keeps the value in the range 0..127
+   complex_out.c.y = 'A';
+   complex_out.c.z = 'B';
+   complex_out.c.w = 'C';
+
+   return complex_out;
+}

diff --git a/tests/lldb/java/Android.mk b/tests/lldb/java/Android.mk
new file mode 100644
index 0000000..5053e7d
--- /dev/null
+++ b/tests/lldb/java/Android.mk

@@ -0,0 +1 @@
+include $(call all-subdir-makefiles) # Includes the makefiles that all-subdir-makefiles returns for this directory.

diff --git a/tests/lldb/java/BranchingFunCalls/Android.mk b/tests/lldb/java/BranchingFunCalls/Android.mk
new file mode 100644
index 0000000..94f9d6f
--- /dev/null
+++ b/tests/lldb/java/BranchingFunCalls/Android.mk

@@ -0,0 +1,17 @@
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+# Test package "BranchingFunCalls": every Java and RenderScript source found under src/.
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := \
+	$(call all-java-files-under, src) \
+	$(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := BranchingFunCalls
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_PRIVATE_PLATFORM_APIS := true
+# Scripts are compiled with debug info (-g) and without optimisation (-O0).
+LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
+
+include $(BUILD_PACKAGE)

diff --git a/tests/lldb/java/BranchingFunCalls/AndroidManifest.xml b/tests/lldb/java/BranchingFunCalls/AndroidManifest.xml
new file mode 100644
index 0000000..fb83a04
--- /dev/null
+++ b/tests/lldb/java/BranchingFunCalls/AndroidManifest.xml

@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Manifest for the BranchingFunCalls app: one launcher activity, MainActivity; minimum SDK 21. -->
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    package="com.android.rs.branchingfuncalls">
+    <uses-sdk android:minSdkVersion="21" />
+    <application android:label="BranchingFunCalls"
+                 android:hardwareAccelerated="true">
+        <activity android:name="MainActivity">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+    </application>
+</manifest>
+

diff --git a/tests/lldb/java/BranchingFunCalls/res/layout/main_layout.xml b/tests/lldb/java/BranchingFunCalls/res/layout/main_layout.xml
new file mode 100644
index 0000000..131c3b5
--- /dev/null
+++ b/tests/lldb/java/BranchingFunCalls/res/layout/main_layout.xml

@@ -0,0 +1,15 @@
+<!-- Layout for MainActivity: a FrameLayout filling its parent, containing one ImageView (id imageView). -->
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#0099cc"
+    tools:context=".MainActivity">
+
+    <ImageView
+        android:id="@+id/imageView"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:scaleType="fitCenter" />
+
+</FrameLayout>
+

diff --git a/tests/lldb/java/BranchingFunCalls/src/com/android/rs/branchingfuncalls/MainActivity.java b/tests/lldb/java/BranchingFunCalls/src/com/android/rs/branchingfuncalls/MainActivity.java
new file mode 100644
index 0000000..a2c1f82
--- /dev/null
+++ b/tests/lldb/java/BranchingFunCalls/src/com/android/rs/branchingfuncalls/MainActivity.java

@@ -0,0 +1,63 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.android.rs.branchingfuncalls;
+
+import android.app.Activity;
+import android.os.Bundle;
+import android.renderscript.*;
+
+/**
+ * Test activity that runs the "scalars" script once over a 256-element int allocation,
+ * using a RenderScript context created with CREATE_FLAG_WAIT_FOR_ATTACH.
+ */
+public class MainActivity extends Activity {
+    private RenderScript mRS;
+    private Allocation mInAllocation;
+    private Allocation mOutAllocation;
+    private ScriptC_scalars mScript;
+    private int mAllocSize = 256;
+
+    @Override
+    protected void onCreate(Bundle savedInstanceState) {
+        super.onCreate(savedInstanceState);
+        setContentView(R.layout.main_layout);
+
+        createScript();
+        runScript();
+    }
+
+    /** Creates the context, the int32 input/output allocations and the script object. */
+    private void createScript() {
+        final int contextFlags = RenderScript.CREATE_FLAG_LOW_LATENCY
+                | RenderScript.CREATE_FLAG_WAIT_FOR_ATTACH;
+        mRS = RenderScript.create(this, RenderScript.ContextType.NORMAL, contextFlags);
+
+        final Element int32 = Element.I32(mRS);
+        mInAllocation = Allocation.createSized(mRS, int32, mAllocSize);
+        mOutAllocation = Allocation.createSized(mRS, int32, mAllocSize);
+
+        mScript = new ScriptC_scalars(mRS);
+    }
+
+    /** Invokes addToGlobal(234), then launches simple_kernel over values centred on zero. */
+    private void runScript() {
+        mScript.invoke_addToGlobal(234);
+
+        // Input runs from -size/2 up to size/2 - 1, so it contains negative, zero
+        // and positive elements.
+        final int midpoint = mAllocSize / 2;
+        final int[] values = new int[mAllocSize];
+        int next = -midpoint;
+        for (int slot = 0; slot < values.length; slot++) {
+            values[slot] = next++;
+        }
+        mInAllocation.copy1DRangeFrom(0, mAllocSize, values);
+        mScript.forEach_simple_kernel(mInAllocation, mOutAllocation);
+    }
+}
+

diff --git a/tests/lldb/java/BranchingFunCalls/src/rs/scalars.rscript b/tests/lldb/java/BranchingFunCalls/src/rs/scalars.rscript
new file mode 100644
index 0000000..7206330
--- /dev/null
+++ b/tests/lldb/java/BranchingFunCalls/src/rs/scalars.rscript

@@ -0,0 +1,76 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.branchingfuncalls)
+
+static bool is_neg(int a)  // Returns true only when a is strictly less than zero.
+{
+    if(a < 0)
+        return true;
+    else
+        return false;  // zero is reported as not negative
+}
+
+static bool is_pos(int a)  // Returns true only when a is strictly greater than zero.
+{
+    if(a > 0)
+        return true;
+    else
+        return false;  // zero is reported as not positive
+}
+
+static void set_i(int * a, int b)  // Stores b into *a by way of a local temporary.
+{
+    int tmp = b;
+    *a = tmp;
+}
+
+static void modify_f(float * f)  // Halves the float that f points to, in place.
+{
+    *f *= 0.5f;
+}
+
+static void modify_i(int * i)  // Rewrites *i: negative -> 0, zero -> 128, positive -> min(*i, 128).
+{
+    int j = *i;
+    int cutoff = 2 << 6;  // 128
+    if(j > cutoff)
+        j = cutoff;  // clamp the working copy from above
+    if(is_neg(j))
+        set_i(i, 0);
+    else if(is_pos(j))
+        set_i(i, j);  // j has already been clamped to cutoff
+    else
+        set_i(i, cutoff);  // reached only when j == 0
+}
+
+int __attribute__((kernel)) simple_kernel(int in)  // Returns in * (int)(in * 0.5f).
+{
+    int i = in;
+    float f = (float) i;
+    modify_f(&f);  // f becomes half of in
+    modify_i(&i);  // rewrites i, which is not read again in this kernel
+    int ret = (int) f;  // float-to-int conversion drops the fractional part
+    return in * ret;
+}
+
+int glob = 123;  // script global, modified by addToGlobal()
+
+void addToGlobal(int arg)  // Adds arg to glob; the Java side calls it as invoke_addToGlobal().
+{
+    glob += arg;
+}

diff --git a/tests/lldb/java/DebugWaitAttach/Android.mk b/tests/lldb/java/DebugWaitAttach/Android.mk
new file mode 100644
index 0000000..3486b78
--- /dev/null
+++ b/tests/lldb/java/DebugWaitAttach/Android.mk

@@ -0,0 +1,15 @@
+# Builds the JavaDebugWaitAttach package from the Java and RenderScript sources under src/.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := JavaDebugWaitAttach
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_PRIVATE_PLATFORM_APIS := true
+
+# Scripts are compiled with debug info (-g) and optimisation disabled (-O0).
+LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
+
+include $(BUILD_PACKAGE)

diff --git a/tests/lldb/java/DebugWaitAttach/AndroidManifest.xml b/tests/lldb/java/DebugWaitAttach/AndroidManifest.xml
new file mode 100644
index 0000000..862af28
--- /dev/null
+++ b/tests/lldb/java/DebugWaitAttach/AndroidManifest.xml

@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Manifest for the JavaDebugWaitAttach app: one launcher activity, MainActivity; minimum SDK 21. -->
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    package="com.android.rs.waitattachdebug">
+    <uses-sdk android:minSdkVersion="21" />
+    <application android:label="JavaDebugWaitAttach"
+                 android:hardwareAccelerated="true">
+        <activity android:name="MainActivity">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+    </application>
+</manifest>

diff --git a/tests/lldb/java/DebugWaitAttach/res/layout/main_layout.xml b/tests/lldb/java/DebugWaitAttach/res/layout/main_layout.xml
new file mode 100644
index 0000000..4ef172f
--- /dev/null
+++ b/tests/lldb/java/DebugWaitAttach/res/layout/main_layout.xml

@@ -0,0 +1,14 @@
+<!-- Layout for MainActivity: a FrameLayout filling its parent, containing one ImageView (id imageView). -->
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#0099cc"
+    tools:context=".MainActivity">
+
+    <ImageView
+        android:id="@+id/imageView"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:scaleType="fitCenter" />
+
+</FrameLayout>

diff --git a/tests/lldb/java/DebugWaitAttach/src/com/android/rs/waitattachdebug/MainActivity.java b/tests/lldb/java/DebugWaitAttach/src/com/android/rs/waitattachdebug/MainActivity.java
new file mode 100644
index 0000000..3aeefd3
--- /dev/null
+++ b/tests/lldb/java/DebugWaitAttach/src/com/android/rs/waitattachdebug/MainActivity.java

@@ -0,0 +1,72 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.android.rs.waitattachdebug;
+
+import android.app.Activity;
+import android.graphics.Bitmap;
+import android.os.Bundle;
+import android.widget.ImageView;
+import android.renderscript.*;
+
+/**
+ * Test activity that runs both kernels of the "simple" script over an 8x8 bitmap,
+ * using a RenderScript context created with CREATE_FLAG_WAIT_FOR_ATTACH.
+ */
+public class MainActivity extends Activity {
+    private Bitmap mBitmapIn;
+    private Bitmap mBitmapOut;
+    private ImageView mImageView;
+
+    private RenderScript mRS;
+    private Allocation mInAllocation;
+    private Allocation mOutAllocation;
+    private ScriptC_simple mScript;
+
+    @Override
+    protected void onCreate(Bundle savedInstanceState) {
+        super.onCreate(savedInstanceState);
+        setContentView(R.layout.main_layout);
+
+        // The output bitmap copies the input's dimensions and pixel config.
+        mBitmapIn = Bitmap.createBitmap(8, 8, Bitmap.Config.ARGB_8888);
+        final int width = mBitmapIn.getWidth();
+        final int height = mBitmapIn.getHeight();
+        mBitmapOut = Bitmap.createBitmap(width, height, mBitmapIn.getConfig());
+
+        mImageView = findViewById(R.id.imageView);
+        mImageView.setImageBitmap(mBitmapOut);
+
+        createScript();
+        updateImage(1.0f);
+    }
+
+    /** Creates the context, wraps both bitmaps in allocations and instantiates the script. */
+    private void createScript() {
+        final int contextFlags = RenderScript.CREATE_FLAG_LOW_LATENCY
+                | RenderScript.CREATE_FLAG_WAIT_FOR_ATTACH;
+        mRS = RenderScript.create(this, RenderScript.ContextType.NORMAL, contextFlags);
+
+        mInAllocation = Allocation.createFromBitmap(mRS, mBitmapIn);
+        mOutAllocation = Allocation.createFromBitmap(mRS, mBitmapOut);
+
+        mScript = new ScriptC_simple(mRS);
+    }
+
+    /**
+     * Sets gColor, runs simple_kernel and copies its output into the displayed bitmap,
+     * then runs other_kernel (whose output is not copied back). The argument is unused.
+     */
+    private void updateImage(final float f) {
+        final Float4 colour = new Float4(0.9f, 0.8f, 0.5f, 1.0f);
+        mScript.set_gColor(colour);
+
+        mScript.forEach_simple_kernel(mInAllocation, mOutAllocation);
+        mOutAllocation.copyTo(mBitmapOut);
+
+        mScript.forEach_other_kernel(mInAllocation, mOutAllocation);
+    }
+}
+

diff --git a/tests/lldb/java/DebugWaitAttach/src/rs/simple.rscript b/tests/lldb/java/DebugWaitAttach/src/rs/simple.rscript
new file mode 100644
index 0000000..6b564c0
--- /dev/null
+++ b/tests/lldb/java/DebugWaitAttach/src/rs/simple.rscript

@@ -0,0 +1,39 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.waitattachdebug)
+
+float4 gColor = {0.299f, 0.587f, 0.114f, 1.f};  // default value; the Java side overwrites it via set_gColor()
+
+/* RenderScript kernel that ignores its input pixel and returns gColor packed
+ * as 8888. The unpack/assign/pack steps are there so it is not completely empty
+ * (and can therefore be debugged).
+ */
+uchar4 __attribute__((kernel)) simple_kernel(uchar4 in)
+{
+    float4 temp = rsUnpackColor8888(in);
+    temp = gColor;  // the unpacked input is overwritten before it is ever used
+    uchar4 result = rsPackColorTo8888(temp);
+    return result;
+}
+
+// Extra kernel to test lldb setting breakpoints on all the RS kernels; returns the input with its components reversed.
+uchar4 __attribute__((kernel)) other_kernel(uchar4 in)
+{
+    uchar4 result = in.wzyx;  // swizzle: (x, y, z, w) -> (w, z, y, x)
+    return result;
+}

diff --git a/tests/lldb/java/InfiniteLoop/Android.mk b/tests/lldb/java/InfiniteLoop/Android.mk
new file mode 100644
index 0000000..02e2790
--- /dev/null
+++ b/tests/lldb/java/InfiniteLoop/Android.mk

@@ -0,0 +1,15 @@
+# Builds the JavaInfiniteLoop package from the Java and RenderScript sources under src/.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := JavaInfiniteLoop
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+# This package sets LOCAL_SDK_VERSION rather than LOCAL_PRIVATE_PLATFORM_APIS.
+LOCAL_SDK_VERSION := current
+
+# Scripts are compiled with debug info (-g) and optimisation disabled (-O0).
+LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
+
+include $(BUILD_PACKAGE)

diff --git a/toolkit/test/AndroidManifest.xml b/tests/lldb/java/InfiniteLoop/AndroidManifest.xml
similarity index 60%
copy from toolkit/test/AndroidManifest.xml
copy to tests/lldb/java/InfiniteLoop/AndroidManifest.xml
index f709790..ba846cd 100644
--- a/toolkit/test/AndroidManifest.xml
+++ b/tests/lldb/java/InfiniteLoop/AndroidManifest.xml

@@ -1,15 +1,13 @@
 <?xml version="1.0" encoding="utf-8"?>
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
-        package="com.example.testapp">
+    package="com.android.rs.infiniteloop">
+    <uses-sdk android:minSdkVersion="21" />
+    <application android:label="InfiniteLoop"
+                 android:hardwareAccelerated="true">
 
-    <application
-            android:allowBackup="true"
-            android:label="Toolkit Test"
-            android:supportsRtl="true">
-        <activity android:name=".MainActivity">
+        <activity android:name="MainActivity">
             <intent-filter>
                 <action android:name="android.intent.action.MAIN" />
-
                 <category android:name="android.intent.category.LAUNCHER" />
             </intent-filter>
         </activity>

diff --git a/tests/lldb/java/InfiniteLoop/res/layout/main_layout.xml b/tests/lldb/java/InfiniteLoop/res/layout/main_layout.xml
new file mode 100755
index 0000000..4ef172f
--- /dev/null
+++ b/tests/lldb/java/InfiniteLoop/res/layout/main_layout.xml

@@ -0,0 +1,14 @@
+<!-- Layout for MainActivity: a FrameLayout filling its parent, containing one ImageView (id imageView). -->
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#0099cc"
+    tools:context=".MainActivity">
+
+    <ImageView
+        android:id="@+id/imageView"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:scaleType="fitCenter" />
+
+</FrameLayout>

diff --git a/tests/lldb/java/InfiniteLoop/src/com/android/rs/infiniteloop/MainActivity.java b/tests/lldb/java/InfiniteLoop/src/com/android/rs/infiniteloop/MainActivity.java
new file mode 100644
index 0000000..7243cc5
--- /dev/null
+++ b/tests/lldb/java/InfiniteLoop/src/com/android/rs/infiniteloop/MainActivity.java

@@ -0,0 +1,174 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.android.rs.infiniteloop;
+
+import android.app.Activity;
+import android.graphics.Bitmap;
+import android.os.AsyncTask;
+import android.os.Bundle;
+import android.widget.ImageView;
+import android.renderscript.*;
+
+import java.util.Random;
+import java.util.Timer;
+import java.util.TimerTask;
+
+/**
+ * Test activity that relaunches a trivial RenderScript kernel every two seconds,
+ * each time with a freshly randomised colour, and shows the result in an ImageView.
+ */
+public class MainActivity extends Activity {
+    /* Number of output bitmaps cycled between the RenderScript worker and the UI thread.
+       Ideally this could be reduced to 2, however on some devices 2 buffers still show
+       tearing on the UI.
+     */
+    private final int NUM_BITMAPS = 3;
+    private int mCurrentBitmap = 0;
+    private Bitmap mBitmapIn;
+    private Bitmap[] mBitmapsOut;
+    private ImageView mImageView;
+    private Random mRand;
+    private Timer mTimer;
+
+    private RenderScript mRS;
+    private Allocation mInAllocation;
+    private Allocation[] mOutAllocations;
+    private ScriptC_infiniteloop mScript;
+
+    @Override
+    protected void onCreate(Bundle savedInstanceState) {
+        super.onCreate(savedInstanceState);
+
+        setContentView(R.layout.main_layout);
+        mRand = new Random();
+
+        /*
+         * Initialize UI
+         */
+        mBitmapIn = Bitmap.createBitmap(500, 500, Bitmap.Config.ARGB_8888);
+        mBitmapsOut = new Bitmap[NUM_BITMAPS];
+        for (int i = 0; i < NUM_BITMAPS; ++i) {
+            mBitmapsOut[i] = Bitmap.createBitmap(mBitmapIn.getWidth(),
+                    mBitmapIn.getHeight(), mBitmapIn.getConfig());
+        }
+
+        mImageView = findViewById(R.id.imageView);
+        mImageView.setImageBitmap(mBitmapsOut[mCurrentBitmap]);
+        // Step to the next buffer so the first kernel launch does not write into the
+        // bitmap that is currently displayed. This is a plain assignment, matching the
+        // update in RenderScriptTask.doInBackground(); a compound "+=" would only
+        // produce a valid index while the starting value happens to be 0.
+        mCurrentBitmap = (mCurrentBitmap + 1) % NUM_BITMAPS;
+
+        /*
+         * Create renderScript
+         */
+        createScript();
+
+        /*
+         * Invoke renderScript kernel and update imageView
+         */
+        updateImage(1.0f);
+
+        mTimer = new Timer();
+        mTimer.schedule(new TimerTask() {
+            @Override
+            public void run() {
+                // This callback runs on the Timer's own thread, but AsyncTask must be
+                // created and executed on the UI thread, so hop back before relaunching.
+                runOnUiThread(new Runnable() {
+                    @Override
+                    public void run() {
+                        updateImage(1.f);
+                    }
+                });
+            }
+        }, 2000, 2000);
+    }
+
+    /*
+     * Stop the periodic relaunch so the Timer thread does not outlive the Activity,
+     * and cancel any task that has not started yet.
+     */
+    @Override
+    protected void onDestroy() {
+        if (mTimer != null) {
+            mTimer.cancel();
+            mTimer = null;
+        }
+        if (currentTask != null) {
+            currentTask.cancel(false);
+        }
+        super.onDestroy();
+    }
+
+    /*
+     * Initialize RenderScript: create the context, wrap the input bitmap and every
+     * output bitmap in an Allocation, and load the infiniteloop script.
+     */
+    private void createScript() {
+        // Initialize RS
+        mRS = RenderScript.create(this,
+            RenderScript.ContextType.NORMAL,
+            RenderScript.CREATE_FLAG_LOW_LATENCY);
+
+        // Allocate buffers
+        mInAllocation = Allocation.createFromBitmap(mRS, mBitmapIn);
+        mOutAllocations = new Allocation[NUM_BITMAPS];
+        for (int i = 0; i < NUM_BITMAPS; ++i) {
+            mOutAllocations[i] = Allocation.createFromBitmap(mRS, mBitmapsOut[i]);
+        }
+
+        // Load script
+        mScript = new ScriptC_infiniteloop(mRS);
+    }
+
+    /*
+     * In the AsyncTask, it invokes a simple RenderScript kernel.
+     * After the kernel is done, an operation blocks at Allocation.copyTo() in AsyncTask thread.
+     * Once operations are finished and we reach onPostExecute() in the UI thread,
+     * it can invalidate and update the ImageView UI.
+     */
+    private class RenderScriptTask extends AsyncTask<Float, Integer, Integer> {
+        // True once doInBackground() has actually launched the kernel.
+        boolean issued = false;
+
+        /*
+         * Returns the index of the output bitmap that was rendered into, or -1 when the
+         * task was cancelled before it started any work.
+         */
+        protected Integer doInBackground(Float... values) {
+            int index = -1;
+            if (!isCancelled()) {
+                issued = true;
+                index = mCurrentBitmap;
+
+                /*
+                 * Set global variable in RS: a random opaque colour
+                 */
+                mScript.set_gColour(new Float4(mRand.nextFloat(), mRand.nextFloat(),
+                        mRand.nextFloat(), 1.f));
+
+                /*
+                 * Invoke the kernel
+                 */
+                mScript.forEach_simple_kernel(mInAllocation, mOutAllocations[index]);
+
+                /*
+                 * Copy to bitmap, then advance to the next output buffer
+                 */
+                mOutAllocations[index].copyTo(mBitmapsOut[index]);
+                mCurrentBitmap = (mCurrentBitmap + 1) % NUM_BITMAPS;
+            }
+            return index;
+        }
+
+        // Shows the bitmap with the given index; -1 means nothing was rendered.
+        void updateView(Integer result) {
+            if (result != -1) {
+                // Request UI update
+                mImageView.setImageBitmap(mBitmapsOut[result]);
+                mImageView.invalidate();
+            }
+        }
+
+        protected void onPostExecute(Integer result) {
+            updateView(result);
+        }
+
+        // A task cancelled after its kernel was issued still has a finished bitmap to show.
+        protected void onCancelled(Integer result) {
+            if (issued) {
+                updateView(result);
+            }
+        }
+    }
+
+    RenderScriptTask currentTask = null;
+
+    /*
+     * Invoke AsyncTask and cancel the previous task.
+     * When AsyncTasks are piled up (typically in slow device with heavy kernel),
+     * only the latest (and already started) task invokes RenderScript operation.
+     * Must be called on the UI thread.
+     */
+    private void updateImage(final float f) {
+        if (currentTask != null)
+            currentTask.cancel(false);
+        currentTask = new RenderScriptTask();
+        currentTask.execute(f);
+    }
+}

diff --git a/tests/lldb/java/InfiniteLoop/src/com/android/rs/infiniteloop/infiniteloop.rscript b/tests/lldb/java/InfiniteLoop/src/com/android/rs/infiniteloop/infiniteloop.rscript
new file mode 100644
index 0000000..5042ebf
--- /dev/null
+++ b/tests/lldb/java/InfiniteLoop/src/com/android/rs/infiniteloop/infiniteloop.rscript

@@ -0,0 +1,49 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.infiniteloop)
+#pragma rs_fp_relaxed
+
+float4 gColour = {0.299f, 0.587f, 0.114f, 1.f};  // default value; the Java side overwrites it via set_gColour()
+
+/* RenderScript kernel that ignores its input pixel and returns gColour packed
+ * as 8888. The unpack/assign/pack steps are there so it is not completely empty
+ * (and can therefore be debugged).
+ */
+uchar4 __attribute__((kernel)) simple_kernel(uchar4 in)
+{
+    float4 temp = rsUnpackColor8888(in);
+    temp = gColour;  // the unpacked input is overwritten before it is ever used
+    uchar4 result = rsPackColorTo8888(temp);
+    return result;
+}

diff --git a/tests/lldb/java/KernelVariables/Android.mk b/tests/lldb/java/KernelVariables/Android.mk
new file mode 100644
index 0000000..c642a9d
--- /dev/null
+++ b/tests/lldb/java/KernelVariables/Android.mk

@@ -0,0 +1,15 @@
+# Builds the KernelVariables package from the Java and RenderScript sources under src/.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := KernelVariables
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_PRIVATE_PLATFORM_APIS := true
+
+# Scripts are compiled with debug info (-g) and optimisation disabled (-O0).
+LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
+
+include $(BUILD_PACKAGE)

diff --git a/toolkit/test/AndroidManifest.xml b/tests/lldb/java/KernelVariables/AndroidManifest.xml
similarity index 60%
copy from toolkit/test/AndroidManifest.xml
copy to tests/lldb/java/KernelVariables/AndroidManifest.xml
index f709790..a1e2a74 100644
--- a/toolkit/test/AndroidManifest.xml
+++ b/tests/lldb/java/KernelVariables/AndroidManifest.xml

@@ -1,17 +1,15 @@
 <?xml version="1.0" encoding="utf-8"?>
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
-        package="com.example.testapp">
-
-    <application
-            android:allowBackup="true"
-            android:label="Toolkit Test"
-            android:supportsRtl="true">
-        <activity android:name=".MainActivity">
+    package="com.android.rs.kernelvariables">
+    <uses-sdk android:minSdkVersion="21" />
+    <application android:label="KernelVariables"
+                 android:hardwareAccelerated="true">
+        <activity android:name="MainActivity">
             <intent-filter>
                 <action android:name="android.intent.action.MAIN" />
-
                 <category android:name="android.intent.category.LAUNCHER" />
             </intent-filter>
         </activity>
     </application>
 </manifest>
+

diff --git a/tests/lldb/java/KernelVariables/res/layout/main_layout.xml b/tests/lldb/java/KernelVariables/res/layout/main_layout.xml
new file mode 100644
index 0000000..131c3b5
--- /dev/null
+++ b/tests/lldb/java/KernelVariables/res/layout/main_layout.xml

@@ -0,0 +1,15 @@
+<!-- Layout for MainActivity: a FrameLayout filling its parent, containing one ImageView (id imageView). -->
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#0099cc"
+    tools:context=".MainActivity">
+
+    <ImageView
+        android:id="@+id/imageView"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:scaleType="fitCenter" />
+
+</FrameLayout>
+

diff --git a/tests/lldb/java/KernelVariables/src/com/android/rs/kernelvariables/MainActivity.java b/tests/lldb/java/KernelVariables/src/com/android/rs/kernelvariables/MainActivity.java
new file mode 100644
index 0000000..728b872
--- /dev/null
+++ b/tests/lldb/java/KernelVariables/src/com/android/rs/kernelvariables/MainActivity.java

@@ -0,0 +1,110 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.android.rs.kernelvariables;
+
+import android.app.Activity;
+import android.graphics.Bitmap;
+import android.graphics.ImageFormat;
+import android.os.Bundle;
+import android.widget.ImageView;
+import android.renderscript.*;
+
+public class MainActivity extends Activity {
+    private Bitmap mBitmapIn;
+    private Bitmap mBitmapOut;
+    private ImageView mImageView;
+
+    private RenderScript mRS;
+    private Allocation mInAllocation;
+    private Allocation mOutAllocation;
+    private ScriptC_simple mScript;
+
+    @Override
+    protected void onCreate(Bundle savedInstanceState) {
+        super.onCreate(savedInstanceState);
+
+        setContentView(R.layout.main_layout);
+
+        mBitmapIn = Bitmap.createBitmap(500, 500, Bitmap.Config.ARGB_8888);
+        mBitmapOut = Bitmap.createBitmap(mBitmapIn.getWidth(),
+                    mBitmapIn.getHeight(), mBitmapIn.getConfig());
+
+        mImageView = findViewById(R.id.imageView);
+        mImageView.setImageBitmap(mBitmapOut);
+
+        createScript();
+        updateImage();
+    }
+
+    private void createScript() {
+        mRS = RenderScript.create(this,
+            RenderScript.ContextType.NORMAL,
+            RenderScript.CREATE_FLAG_LOW_LATENCY |
+            RenderScript.CREATE_FLAG_WAIT_FOR_ATTACH);
+
+        mInAllocation = Allocation.createFromBitmap(mRS, mBitmapIn);
+        mOutAllocation = Allocation.createFromBitmap(mRS, mBitmapOut);
+
+        mScript = new ScriptC_simple(mRS);
+    }
+
+    private void updateImage() {
+        int[] buffer_int = {1, 2, 3, 4};
+        Allocation int_allocation = Allocation.createSized(mRS, Element.I32(mRS), 4);
+        int_allocation.copyFrom(buffer_int);
+        mScript.set_allocation_1D_global(int_allocation);
+
+        int[] buffer_int2 = {5, 6, 7, 8};
+
+        Type.Builder typeI32Builder2D = new Type.Builder(mRS, Element.I32(mRS));
+        typeI32Builder2D.setX(2);
+        typeI32Builder2D.setY(2);
+
+        Allocation int_allocation2 = Allocation.createTyped(mRS, typeI32Builder2D.create());
+        int_allocation2.copyFrom(buffer_int2);
+        mScript.set_allocation_1D_global2(int_allocation2);
+
+        mScript.set_allocation_2D_global(mInAllocation);
+        mScript.set_allocation_2D_global2(mOutAllocation);
+
+        int[] buffer_int3 = new int[64];
+
+        for (int i=0; i<4*4*4; ++i)
+            buffer_int3[i] = 9 + i;
+
+        Type.Builder typeI32Builder3D = new Type.Builder(mRS, Element.I32(mRS));
+        typeI32Builder3D.setX(4);
+        typeI32Builder3D.setY(4);
+        typeI32Builder3D.setZ(4);
+
+        Allocation int_allocation3 = Allocation.createTyped(mRS, typeI32Builder3D.create());
+        int_allocation3.copyFrom(buffer_int3);
+        mScript.set_allocation_3D_global(int_allocation3);
+
+        Type.Builder yuvTypeBuilder = new Type.Builder(mRS, Element.YUV(mRS));
+        yuvTypeBuilder.setX(4);
+        yuvTypeBuilder.setY(4);
+        yuvTypeBuilder.setYuvFormat(ImageFormat.YV12);
+        Allocation yuv_allocation = Allocation.createTyped(mRS, yuvTypeBuilder.create());
+        mScript.set_allocation_YUV_2D_global(yuv_allocation);
+
+        mScript.set_sampler_global(Sampler.CLAMP_LINEAR(mRS));
+
+        mScript.forEach_kernel(mInAllocation, mOutAllocation);
+        mOutAllocation.copyTo(mBitmapOut);
+    }
+}

diff --git a/tests/lldb/java/KernelVariables/src/rs/simple.rscript b/tests/lldb/java/KernelVariables/src/rs/simple.rscript
new file mode 100644
index 0000000..26b6aff
--- /dev/null
+++ b/tests/lldb/java/KernelVariables/src/rs/simple.rscript

@@ -0,0 +1,197 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.kernelvariables)
+
+char char_global = 12;  /* Scalar globals; kernel() reassigns several of them. */
+uchar uchar_global = 234;
+short short_global = -321;
+ushort ushort_global = 432;
+int int_global = 1234;
+uint uint_global = 2345;
+float float_global = 4.5f;
+long long_global = -77777;
+ulong ulong_global = 8888;
+double double_global = -456.5f;
+
+char2 char2_global = {11, -22};  /* Two-component vector globals. */
+uchar2 uchar2_global = {33, 44};
+short2 short2_global = {-555, 666};
+ushort2 ushort2_global = {777, 888};
+int2 int2_global = {999, -1111};
+uint2 uint2_global = {2222, 3333};
+float2 float2_global = {4.5f, -5.0f};
+long2 long2_global = {-4444, 5555};
+ulong2 ulong2_global = {6666, 7777};
+double2 double2_global = {88.5f, -99.0f};
+
+char3 char3_global = {11, -22, -33};  /* Three-component vector globals. */
+uchar3 uchar3_global = {33, 44, 55};
+short3 short3_global = {-555, 666, 777};
+ushort3 ushort3_global = {777, 888, 999};
+int3 int3_global = {999, -1111, 2222};
+uint3 uint3_global = {2222, 3333, 4444};
+float3 float3_global = {4.5f, -5.0f, -6.5f};
+long3 long3_global = {-4444, 5555, 6666};
+ulong3 ulong3_global = {6666, 7777, 8888};
+double3 double3_global = {88.5f, -99.0f, 111.5f};
+
+char4 char4_global = {55, 11, -22, -33};  /* Four-component vector globals. */
+uchar4 uchar4_global = {222, 33, 44, 55};
+short4 short4_global = {-444, -555, 666, 777};
+ushort4 ushort4_global = {666, 777, 888, 999};
+int4 int4_global = {888, 999, -1111, 2222};
+uint4 uint4_global = {1111, 2222, 3333, 4444};
+float4 float4_global = {3.0f, 4.5f, -5.0f, -6.5f};
+long4 long4_global = {-3333, -4444, 5555, 6666};
+ulong4 ulong4_global = {5555, 6666, 7777, 8888};
+double4 double4_global = {-77.0f, 88.5f, -99.0f, 111.5f};
+
+rs_matrix2x2 matrix2x2_global;  /* Matrix globals; assigned inside kernel(). */
+rs_matrix3x3 matrix3x3_global;
+rs_matrix4x4 matrix4x4_global;
+
+rs_quaternion quaternion_global;  /* Filled by rsQuaternionSet() in kernel(). */
+
+rs_allocation allocation_1D_global;  /* Allocation handles, bound from the Java side. */
+rs_allocation allocation_1D_global2;
+rs_allocation allocation_2D_global;
+rs_allocation allocation_2D_global2;
+rs_allocation allocation_3D_global;
+rs_allocation allocation_YUV_2D_global;
+
+rs_allocation_cubemap_face cubemap_face_global;  /* Never assigned in this file. */
+rs_sampler sampler_global;  /* Bound from the Java side (set_sampler_global). */
+
+uchar4 __attribute__((kernel)) kernel(uchar4 in)
+{   /* Declares locals of many types, folds some into the globals, returns a constant. */
+    char char_local = 'a';  /* Scalar locals. */
+    uchar uchar_local = 'b';
+    short short_local = -321;
+    ushort ushort_local = 432;
+    int int_local = 1234;
+    uint uint_local = 2345;
+    float float_local = 4.5f;
+    long long_local = -77777;
+    ulong ulong_local = 8888;
+    double double_local = -456.5f;
+
+    char2 char2_local = {-11, -22};  /* Two-component vector locals. */
+    uchar2 uchar2_local = {33, 44};
+    short2 short2_local = {-555, 666};
+    ushort2 ushort2_local = {777, 888};
+    int2 int2_local = {999, -1111};
+    uint2 uint2_local = {2222, 3333};
+    float2 float2_local = {4.5f, -5.0f};
+    long2 long2_local = {-4444, 5555};
+    ulong2 ulong2_local = {6666, 7777};
+    double2 double2_local = {88.5f, -99.0f};
+
+    char3 char3_local = {11, -22, -33};  /* Three-component vector locals. */
+    uchar3 uchar3_local = {33, 44, 55};
+    short3 short3_local = {-555, 666, 777};
+    ushort3 ushort3_local = {777, 888, 999};
+    int3 int3_local = {999, -1111, 2222};
+    uint3 uint3_local = {2222, 3333, 4444};
+    float3 float3_local = {4.5f, -5.0f, -6.5f};
+    long3 long3_local = {-4444, 5555, 6666};
+    ulong3 ulong3_local = {6666, 7777, 8888};
+    double3 double3_local = {88.5f, -99.0f, 111.5f};
+
+    char4 char4_local = {55, 11, -22, -33};  /* Four-component vector locals. */
+    uchar4 uchar4_local = {22, 33, 44, 55};
+    short4 short4_local = {-444, -555, 666, 777};
+    ushort4 ushort4_local = {666, 777, 888, 999};
+    int4 int4_local = {888, 999, -1111, 2222};
+    uint4 uint4_local = {1111, 2222, 3333, 4444};
+    float4 float4_local = {3.0f, 4.5f, -5.0f, -6.5f};
+    long4 long4_local = {-3333, -4444, 5555, 6666};
+    ulong4 ulong4_local = {5555, 6666, 7777, 8888};
+    double4 double4_local = {-77.0f, 88.5f, -99.0f, 111.5f};
+
+    rs_matrix2x2 matrix2x2_local = {{1., 2.5,  /* Matrix locals built from literal lists. */
+                                     3., 4.5}};
+    rs_matrix3x3 matrix3x3_local = {{5., 6.5, 7.,
+                                     8.5, 9., 1.5,
+                                     2., 3.5, 4.}};
+    rs_matrix4x4 matrix4x4_local = {{5.5, 6., 7.5, 8.,
+                                     9., 1.5, 2., 3.5,
+                                     4.5, 5.5, 6.5, 7.,
+                                     8., 9.5, 1.5, 2.5}};
+
+    matrix2x2_global = matrix2x2_local;  /* Copy the local matrices into the globals. */
+    matrix3x3_global = matrix3x3_local;
+    matrix4x4_global = matrix4x4_local;
+
+    rsQuaternionSet(&quaternion_global, 3.0, 4.5, 5.5, 6.0);
+
+    rs_quaternion quaternion_local;
+    rsQuaternionSet(&quaternion_local, 7.5, 8.0, 9.0, 0.5);
+
+    char char_combined = char_local + (char)uchar_local + char2_local.x +  /* Fold scalars and .x lanes. */
+        (char)uchar2_local.x + char3_local.x - (char)uchar3_local.x +  /* NOTE(review): only subtracted term; confirm intended. */
+        char4_local.x + (char)uchar4_local.x;
+
+    short short_combined = short_local + (short)ushort_local + short2_local.x +
+        (short)ushort2_local.x + short3_local.x + (short)ushort3_local.x +
+        short4_local.x + (short)ushort4_local.x;
+
+    int int_combined = int_local + (int)uint_local + int2_local.x +
+        (int)uint2_local.x + int3_local.x + (int)uint3_local.x + int4_local.x +
+        (int)uint4_local.x;
+
+    float float_combined = float_local + float2_local.x + float3_local.x +
+        float4_local.x;
+
+    long long_combined = long_local + (long)ulong_local + long2_local.x +
+        (long)ulong2_local.x + long3_local.x + (long)ulong3_local.x +
+        long4_local.x + (long)ulong4_local.x;
+
+    double double_combined = double_local + double2_local.x + double3_local.x +
+        double4_local.x;
+
+    char_global = char_combined;  /* Publish the folded values through the globals. */
+    short_global = short_combined;
+    int_global = int_combined;
+    float_global = float_combined;
+    long_global = long_combined;
+    double_global = double_combined;
+
+    uchar4 result = {1,2,3,4};  /* Constant output; the input pixel `in` is not read. */
+    return result;
+}
+
+float use_constants_global;  /* Receives the sum computed in setup(). */
+
+void setup(void)
+{ /* Stores the sum of the M_* math constants listed below in use_constants_global. */
+  use_constants_global =
+      M_1_PI +
+      M_2_PI +
+      M_2_PIl +  /* NOTE(review): looks like a legacy alias of M_2_PI; confirm it should stay. */
+      M_2_SQRTPI +
+      M_E +
+      M_LN10 +
+      M_LN2 +
+      M_LOG10E +
+      M_LOG2E +
+      M_PI +
+      M_PI_2 +
+      M_PI_4 +
+      M_SQRT1_2 +
+      M_SQRT2;
+}

diff --git a/tests/lldb/java/MultipleRSFiles/Android.mk b/tests/lldb/java/MultipleRSFiles/Android.mk
new file mode 100644
index 0000000..b45cc51
--- /dev/null
+++ b/tests/lldb/java/MultipleRSFiles/Android.mk

@@ -0,0 +1,15 @@
+# Builds the MultipleRSFiles test package from the Java and RenderScript
+# sources found under src/.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := MultipleRSFiles
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_PRIVATE_PLATFORM_APIS := true
+
+# Flags for the RenderScript compiler. -g and -O0 presumably keep the scripts
+# debuggable (debug info, no optimisation); confirm what -target-api 0 selects.
+LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
+
+include $(BUILD_PACKAGE)

diff --git a/toolkit/test/AndroidManifest.xml b/tests/lldb/java/MultipleRSFiles/AndroidManifest.xml
similarity index 60%
copy from toolkit/test/AndroidManifest.xml
copy to tests/lldb/java/MultipleRSFiles/AndroidManifest.xml
index f709790..5a83901 100644
--- a/toolkit/test/AndroidManifest.xml
+++ b/tests/lldb/java/MultipleRSFiles/AndroidManifest.xml

@@ -1,15 +1,12 @@
 <?xml version="1.0" encoding="utf-8"?>
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
-        package="com.example.testapp">
-
-    <application
-            android:allowBackup="true"
-            android:label="Toolkit Test"
-            android:supportsRtl="true">
-        <activity android:name=".MainActivity">
+    package="com.android.rs.multiplersfiles">
+    <uses-sdk android:minSdkVersion="21" />
+    <application android:label="MultipleRSFiles"
+                 android:hardwareAccelerated="true">
+        <activity android:name="MainActivity">
             <intent-filter>
                 <action android:name="android.intent.action.MAIN" />
-
                 <category android:name="android.intent.category.LAUNCHER" />
             </intent-filter>
         </activity>

diff --git a/tests/lldb/java/MultipleRSFiles/res/layout/main_layout.xml b/tests/lldb/java/MultipleRSFiles/res/layout/main_layout.xml
new file mode 100644
index 0000000..4ef172f
--- /dev/null
+++ b/tests/lldb/java/MultipleRSFiles/res/layout/main_layout.xml

@@ -0,0 +1,14 @@
+<!-- Full-size frame holding one ImageView (id: imageView); MainActivity looks
+     the view up by that id and attaches its output bitmap to it. -->
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#0099cc"
+    tools:context=".MainActivity">
+
+    <ImageView
+        android:id="@+id/imageView"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:scaleType="fitCenter" />
+
+</FrameLayout>

diff --git a/tests/lldb/java/MultipleRSFiles/src/com/android/rs/multiplersfiles/MainActivity.java b/tests/lldb/java/MultipleRSFiles/src/com/android/rs/multiplersfiles/MainActivity.java
new file mode 100644
index 0000000..7895484
--- /dev/null
+++ b/tests/lldb/java/MultipleRSFiles/src/com/android/rs/multiplersfiles/MainActivity.java

@@ -0,0 +1,74 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.android.rs.multiplersfiles;
+
+import android.app.Activity;
+import android.graphics.Bitmap;
+import android.os.Bundle;
+import android.widget.ImageView;
+import android.renderscript.*;
+
+/**
+ * Activity of the MultipleRSFiles test app. It creates one RenderScript
+ * context, instantiates the scripts generated from first.rscript and
+ * second.rscript, and launches one kernel from each over the same input and
+ * output allocations.
+ */
+public class MainActivity extends Activity {
+    private Bitmap mBitmapIn;
+    private Bitmap mBitmapOut;
+    private ImageView mImageView;
+
+    private RenderScript mRS;
+    private Allocation mInAllocation;
+    private Allocation mOutAllocation;
+    private ScriptC_first mFirstScript;
+    private ScriptC_second mSecondScript;
+
+    /** Builds the bitmaps and the view, then creates and runs both scripts. */
+    @Override
+    protected void onCreate(Bundle savedInstanceState) {
+        super.onCreate(savedInstanceState);
+        setContentView(R.layout.main_layout);
+
+        // The output bitmap mirrors the size and config of the input bitmap.
+        final Bitmap source = Bitmap.createBitmap(500, 500, Bitmap.Config.ARGB_8888);
+        final int width = source.getWidth();
+        final int height = source.getHeight();
+        mBitmapIn = source;
+        mBitmapOut = Bitmap.createBitmap(width, height, source.getConfig());
+
+        final ImageView view = findViewById(R.id.imageView);
+        view.setImageBitmap(mBitmapOut);
+        mImageView = view;
+
+        createScript();
+        updateImage(1.0f);
+    }
+
+    /** Creates the context, the bitmap-backed allocations and both scripts. */
+    private void createScript() {
+        final int contextFlags = RenderScript.CREATE_FLAG_LOW_LATENCY
+                | RenderScript.CREATE_FLAG_WAIT_FOR_ATTACH;
+        mRS = RenderScript.create(this, RenderScript.ContextType.NORMAL, contextFlags);
+
+        mInAllocation = Allocation.createFromBitmap(mRS, mBitmapIn);
+        mOutAllocation = Allocation.createFromBitmap(mRS, mBitmapOut);
+
+        mFirstScript = new ScriptC_first(mRS);
+        mSecondScript = new ScriptC_second(mRS);
+    }
+
+    /**
+     * Runs first_kernel, copies its output into the output bitmap, then runs
+     * second_kernel. The second kernel's output is not copied back.
+     *
+     * @param f unused
+     */
+    private void updateImage(final float f) {
+        final Float4 colour = new Float4(0.9f, 0.8f, 0.5f, 1.0f);
+        mFirstScript.set_gColor(colour);
+        mFirstScript.forEach_first_kernel(mInAllocation, mOutAllocation);
+        mOutAllocation.copyTo(mBitmapOut);
+
+        mSecondScript.forEach_second_kernel(mInAllocation, mOutAllocation);
+    }
+}
+

diff --git a/tests/lldb/java/MultipleRSFiles/src/rs/first.rscript b/tests/lldb/java/MultipleRSFiles/src/rs/first.rscript
new file mode 100644
index 0000000..cbed426
--- /dev/null
+++ b/tests/lldb/java/MultipleRSFiles/src/rs/first.rscript

@@ -0,0 +1,32 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.multiplersfiles)
+
+float4 gColor = {0.299f, 0.587f, 0.114f, 1.f};  /* Default; Java calls set_gColor before the launch. */
+
+/* RenderScript kernel that ignores the unpacked input and returns gColor
+ * packed as RGBA8888 for every element. The unpack/overwrite steps exist only
+ * so the kernel is not completely empty (and can therefore be debugged).
+ */
+uchar4 __attribute__((kernel)) first_kernel(uchar4 in)
+{
+    float4 temp = rsUnpackColor8888(in);
+    temp = gColor;  /* Deliberately discards the unpacked input. */
+    uchar4 result = rsPackColorTo8888(temp);
+    return result;
+}

diff --git a/tests/lldb/java/MultipleRSFiles/src/rs/second.rscript b/tests/lldb/java/MultipleRSFiles/src/rs/second.rscript
new file mode 100644
index 0000000..c87d7ae
--- /dev/null
+++ b/tests/lldb/java/MultipleRSFiles/src/rs/second.rscript

@@ -0,0 +1,25 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.multiplersfiles)
+
+/* RenderScript kernel that returns the input's components in reverse order. */
+uchar4 __attribute__((kernel)) second_kernel(uchar4 in)
+{
+    uchar4 result = in.wzyx;  /* result.x = in.w, .y = in.z, .z = in.y, .w = in.x */
+    return result;
+}

diff --git a/tests/lldb/java/NoDebugWaitAttach/Android.mk b/tests/lldb/java/NoDebugWaitAttach/Android.mk
new file mode 100644
index 0000000..355ffb7
--- /dev/null
+++ b/tests/lldb/java/NoDebugWaitAttach/Android.mk

@@ -0,0 +1,13 @@
+# Builds the JavaNoDebugWaitAttach test package from the Java and RenderScript
+# sources found under src/.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
+
+# NOTE(review): no LOCAL_RENDERSCRIPT_FLAGS are set here, so the script is
+# built with the default flags. Presumably deliberate given the "NoDebug"
+# package name; confirm.
+LOCAL_PACKAGE_NAME := JavaNoDebugWaitAttach
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_PRIVATE_PLATFORM_APIS := true
+
+include $(BUILD_PACKAGE)

diff --git a/tests/lldb/java/NoDebugWaitAttach/AndroidManifest.xml b/tests/lldb/java/NoDebugWaitAttach/AndroidManifest.xml
new file mode 100644
index 0000000..dbc31a5
--- /dev/null
+++ b/tests/lldb/java/NoDebugWaitAttach/AndroidManifest.xml

@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Manifest of the JavaNoDebugWaitAttach test app: minimum SDK 21 and a
+     single launcher activity, MainActivity. -->
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    package="com.android.rs.waitattachnodebug">
+    <uses-sdk android:minSdkVersion="21" />
+    <application android:label="JavaNoDebugWaitAttach"
+                 android:hardwareAccelerated="true">
+        <activity android:name="MainActivity">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+    </application>
+</manifest>

diff --git a/tests/lldb/java/NoDebugWaitAttach/res/layout/main_layout.xml b/tests/lldb/java/NoDebugWaitAttach/res/layout/main_layout.xml
new file mode 100644
index 0000000..4ef172f
--- /dev/null
+++ b/tests/lldb/java/NoDebugWaitAttach/res/layout/main_layout.xml

@@ -0,0 +1,14 @@
+<!-- Full-size frame holding one ImageView (id: imageView); MainActivity looks
+     the view up by that id and attaches its output bitmap to it. -->
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#0099cc"
+    tools:context=".MainActivity">
+
+    <ImageView
+        android:id="@+id/imageView"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:scaleType="fitCenter" />
+
+</FrameLayout>

diff --git a/tests/lldb/java/NoDebugWaitAttach/src/com/android/rs/waitattachnodebug/MainActivity.java b/tests/lldb/java/NoDebugWaitAttach/src/com/android/rs/waitattachnodebug/MainActivity.java
new file mode 100644
index 0000000..a43b387
--- /dev/null
+++ b/tests/lldb/java/NoDebugWaitAttach/src/com/android/rs/waitattachnodebug/MainActivity.java

@@ -0,0 +1,71 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.android.rs.waitattachnodebug;
+
+import android.app.Activity;
+import android.graphics.Bitmap;
+import android.os.Bundle;
+import android.widget.ImageView;
+import android.renderscript.*;
+
+/**
+ * Activity of the JavaNoDebugWaitAttach test app. It creates a RenderScript
+ * context with the wait-for-attach flag, instantiates the script generated
+ * from simple.rscript and runs simple_kernel once over an 8x8 bitmap.
+ */
+public class MainActivity extends Activity {
+    private Bitmap mBitmapIn;
+    private Bitmap mBitmapOut;
+    private ImageView mImageView;
+
+    private RenderScript mRS;
+    private Allocation mInAllocation;
+    private Allocation mOutAllocation;
+    private ScriptC_simple mScript;
+
+    /** Builds the bitmaps and the view, then creates and runs the script. */
+    @Override
+    protected void onCreate(Bundle savedInstanceState) {
+        super.onCreate(savedInstanceState);
+        setContentView(R.layout.main_layout);
+
+        // The output bitmap mirrors the size and config of the input bitmap.
+        final Bitmap source = Bitmap.createBitmap(8, 8, Bitmap.Config.ARGB_8888);
+        final int width = source.getWidth();
+        final int height = source.getHeight();
+        mBitmapIn = source;
+        mBitmapOut = Bitmap.createBitmap(width, height, source.getConfig());
+
+        final ImageView view = findViewById(R.id.imageView);
+        view.setImageBitmap(mBitmapOut);
+        mImageView = view;
+
+        createScript();
+        updateImage(1.0f);
+    }
+
+    /** Creates the context, the bitmap-backed allocations and the script. */
+    private void createScript() {
+        final int contextFlags = RenderScript.CREATE_FLAG_LOW_LATENCY
+                | RenderScript.CREATE_FLAG_WAIT_FOR_ATTACH;
+        mRS = RenderScript.create(this, RenderScript.ContextType.NORMAL, contextFlags);
+
+        mInAllocation = Allocation.createFromBitmap(mRS, mBitmapIn);
+        mOutAllocation = Allocation.createFromBitmap(mRS, mBitmapOut);
+
+        mScript = new ScriptC_simple(mRS);
+    }
+
+    /**
+     * Sets the script colour, runs simple_kernel and copies the result into
+     * the output bitmap.
+     *
+     * @param f unused
+     */
+    private void updateImage(final float f) {
+        final Float4 colour = new Float4(0.9f, 0.8f, 0.5f, 1.0f);
+        mScript.set_gColor(colour);
+        mScript.forEach_simple_kernel(mInAllocation, mOutAllocation);
+        mOutAllocation.copyTo(mBitmapOut);
+    }
+}
+

diff --git a/tests/lldb/java/NoDebugWaitAttach/src/rs/simple.rscript b/tests/lldb/java/NoDebugWaitAttach/src/rs/simple.rscript
new file mode 100644
index 0000000..6652675
--- /dev/null
+++ b/tests/lldb/java/NoDebugWaitAttach/src/rs/simple.rscript

@@ -0,0 +1,33 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.waitattachnodebug)
+
+float4 gColor = {0.299f, 0.587f, 0.114f, 1.f};  /* Default; Java calls set_gColor before the launch. */
+
+/* RenderScript kernel that ignores the unpacked input and returns gColor
+ * packed as RGBA8888 for every element. The unpack/overwrite steps exist only
+ * so the kernel is not completely empty (and can therefore be debugged).
+ */
+uchar4 __attribute__((kernel)) simple_kernel(uchar4 in)
+{
+    float4 temp = rsUnpackColor8888(in);
+    temp = gColor;  /* Deliberately discards the unpacked input. */
+    uchar4 result = rsPackColorTo8888(temp);
+    return result;
+}
+

diff --git a/tests/lldb/java/Reduction/Android.mk b/tests/lldb/java/Reduction/Android.mk
new file mode 100644
index 0000000..6e71659
--- /dev/null
+++ b/tests/lldb/java/Reduction/Android.mk

@@ -0,0 +1,31 @@
+#
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Builds the Reduction test package from the Java and RenderScript sources
+# found under src/.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
+
+# Flags for the RenderScript compiler. -g and -O0 presumably keep the scripts
+# debuggable (debug info, no optimisation); confirm what -target-api 0 selects.
+LOCAL_RENDERSCRIPT_FLAGS := -target-api 0 -O0 -g
+
+LOCAL_PACKAGE_NAME := Reduction
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_PRIVATE_PLATFORM_APIS := true
+
+include $(BUILD_PACKAGE)

diff --git a/toolkit/test/AndroidManifest.xml b/tests/lldb/java/Reduction/AndroidManifest.xml
similarity index 61%
rename from toolkit/test/AndroidManifest.xml
rename to tests/lldb/java/Reduction/AndroidManifest.xml
index f709790..61177d9 100644
--- a/toolkit/test/AndroidManifest.xml
+++ b/tests/lldb/java/Reduction/AndroidManifest.xml

@@ -1,15 +1,11 @@
 <?xml version="1.0" encoding="utf-8"?>
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
-        package="com.example.testapp">
-
-    <application
-            android:allowBackup="true"
-            android:label="Toolkit Test"
-            android:supportsRtl="true">
-        <activity android:name=".MainActivity">
+    package="com.android.rs.lldbreductiontest">
+    <uses-sdk android:minSdkVersion="21" />
+    <application android:label="Reduction">
+        <activity android:name="MainActivity" android:screenOrientation="portrait">
             <intent-filter>
                 <action android:name="android.intent.action.MAIN" />
-
                 <category android:name="android.intent.category.LAUNCHER" />
             </intent-filter>
         </activity>

diff --git a/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/MainActivity.java b/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/MainActivity.java
new file mode 100644
index 0000000..b10d5d2
--- /dev/null
+++ b/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/MainActivity.java

@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This application is a simple scheduler for testing the lldb debugger
+ * implementation for general reduction kernels.
+ *
+ * It launches one of two simple reductions in a loop
+ */
+
+package com.android.rs.lldbreductiontest;
+
+import android.app.Activity;
+import android.content.Context;
+import android.content.res.Resources;
+import android.os.Handler;
+import android.os.Bundle;
+import android.util.Log;
+import android.renderscript.*;
+
+import java.lang.Float;
+import java.lang.Math;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Random;
+
+/**
+ * Test activity for the lldb general-reduction tests. After an initial delay
+ * it runs, once per second, the reduction with a user-written combiner and
+ * the reduction with an auto-generated combiner, each over a freshly filled
+ * MinUserType allocation.
+ */
+public class MainActivity extends Activity {
+  static private int idxOffset = 10;
+  static private int mX = 128;
+  static private int mY = 2;
+  static private int mZ = 2;
+  static private float mMultiplier = 2.f;
+  static private final int LOOP_DELAY_MILLIS = 1000;
+
+  private RenderScript mRS;
+  private ScriptC_reduce mScript;
+  private ScriptC_reduce_auto_comb mScript_auto_comb;
+
+  // The handler and its task are fields so that onDestroy() can cancel the
+  // pending run; otherwise runRS() could fire on a destroyed context.
+  private Handler mLoopHandler;
+  private final Runnable mLoopTask = new Runnable() {
+    @Override
+    public void run() {
+      runRS();
+      mLoopHandler.postDelayed(this, LOOP_DELAY_MILLIS);
+    }
+  };
+
+  /** Creates the RenderScript objects and schedules the first reduction run. */
+  @Override
+  protected void onCreate(Bundle savedInstanceState) {
+    super.onCreate(savedInstanceState);
+    initRS();
+
+    mLoopHandler = new Handler();
+    mLoopHandler.postDelayed(mLoopTask, LOOP_DELAY_MILLIS);
+  }
+
+  /**
+   * Sets the script globals and runs the auto-combiner reduction.
+   *
+   * @param rs unused
+   * @param s script that owns the reduction
+   * @param alloc input allocation of MinUserType elements
+   * @return the value produced by the reduction
+   */
+  private float findMinUserTypeAutoComb(
+      RenderScript rs, ScriptC_reduce_auto_comb s, Allocation alloc) {
+    s.set_a_startval(mX);
+    s.set_b_startval(mY);
+    s.set_multiplier(mMultiplier);
+
+    return s.reduce_find_min_user_type_auto_comb(alloc).get();
+  }
+
+  /**
+   * Sets the script globals and runs the user-combiner reduction.
+   *
+   * @param rs unused
+   * @param s script that owns the reduction
+   * @param alloc input allocation of MinUserType elements
+   * @return the value produced by the reduction
+   */
+  private float findMinUserType(RenderScript rs, ScriptC_reduce s, Allocation alloc) {
+    s.set_a_startval(mX);
+    s.set_b_startval(mY);
+    s.set_multiplier(mMultiplier);
+
+    return s.reduce_find_min_user_type(alloc).get();
+  }
+
+  /**
+   * Builds the 1D reduction input: element i has a == b == i + idxOffset.
+   *
+   * @param rs context the allocation is created in
+   * @param xCount number of elements
+   * @return the allocation backing the filled MinUserType array
+   */
+  private Allocation createMinUserTypeInput(RenderScript rs, int xCount) {
+    ScriptField_MinUserType minUserType = new ScriptField_MinUserType(rs, xCount);
+    for (int i = 0; i < xCount; i++) {
+      ScriptField_MinUserType.Item val = new ScriptField_MinUserType.Item();
+      val.a = i + idxOffset;
+      val.b = i + idxOffset;
+      minUserType.set(val, i, true);
+    }
+
+    return minUserType.getAllocation();
+  }
+
+  /** Runs the auto-combiner reduction over a fresh xCount-element input. */
+  private float findMinUserType1DAutoComb(RenderScript rs, ScriptC_reduce_auto_comb s, int xCount) {
+    return findMinUserTypeAutoComb(rs, s, createMinUserTypeInput(rs, xCount));
+  }
+
+  /** Runs the user-combiner reduction over a fresh xCount-element input. */
+  private float findMinUserType1D(RenderScript rs, ScriptC_reduce s, int xCount) {
+    return findMinUserType(rs, s, createMinUserTypeInput(rs, xCount));
+  }
+
+  /** Creates the RenderScript context and both reduction scripts. */
+  public void initRS() {
+    mRS = RenderScript.create(this, RenderScript.ContextType.NORMAL,
+        RenderScript.CREATE_FLAG_LOW_LATENCY | RenderScript.CREATE_FLAG_WAIT_FOR_ATTACH);
+    mScript = new ScriptC_reduce(mRS);
+    mScript_auto_comb = new ScriptC_reduce_auto_comb(mRS);
+  }
+
+  /** Runs both reductions once; the results are discarded. */
+  public void runRS() {
+    findMinUserType1D(mRS, mScript, mX);
+    findMinUserType1DAutoComb(mRS, mScript_auto_comb, mX);
+  }
+
+  /** Stops the reduction loop, tears the context down, then calls super. */
+  @Override
+  public void onDestroy() {
+    // Cancel the pending run first so nothing touches mRS after destroy().
+    mLoopHandler.removeCallbacks(mLoopTask);
+    mRS.finish();
+    mRS.destroy();
+    // Activity requires overrides of onDestroy() to call through to super.
+    super.onDestroy();
+  }
+}

diff --git a/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/reduce.rscript b/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/reduce.rscript
new file mode 100644
index 0000000..c8fb088
--- /dev/null
+++ b/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/reduce.rscript

@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Script for the lldb general-reduction tests: declares the reduction
+ * find_min_user_type with an explicit, user-written combiner.
+ *
+ * The functions named in the pragma below are defined in reduce_common.rsh.
+ */
+
+#pragma rs java_package_name(com.android.rs.lldbreductiontest)
+#pragma version(1)
+#pragma rs reduce(find_min_user_type) initializer(find_min_user_type_init) \
+    accumulator(find_min_user_type_accum)                                  \
+        outconverter(find_min_user_type_outc)                              \
+            combiner(find_min_user_type_comb)
+
+#define RSTESTS_USER_COMBINER  /* Makes the header compile find_min_user_type_comb. */
+#include "reduce_common.rsh"

diff --git a/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/reduce_auto_comb.rscript b/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/reduce_auto_comb.rscript
new file mode 100644
index 0000000..5a9de90
--- /dev/null
+++ b/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/reduce_auto_comb.rscript

@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Script for the lldb general-reduction tests: declares the reduction
+ * find_min_user_type_auto_comb without naming a combiner.
+ *
+ * The functions named in the pragma below are defined in reduce_common.rsh.
+ */
+
+#pragma rs java_package_name(com.android.rs.lldbreductiontest)
+#pragma version(1)
+#pragma rs reduce(find_min_user_type_auto_comb)                                \
+    initializer(find_min_user_type_init) accumulator(find_min_user_type_accum) \
+        outconverter(find_min_user_type_outc)
+
+#include "reduce_common.rsh"

diff --git a/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/reduce_common.rsh b/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/reduce_common.rsh
new file mode 100644
index 0000000..846d530
--- /dev/null
+++ b/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/reduce_common.rsh

@@ -0,0 +1,59 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+int __attribute__((kernel)) my_foreach_kernel(int a) {  // returns its input plus one
+  // This kernel is unused; it exists so that we can make sure an ordinary
+  // foreach kernel is not listed as a reduction kernel by the debugger.
+  return a + 1;
+}
+
+typedef struct MinUserType {  // accumulator and input element type of the reduction
+  int32_t a;  // added unscaled when two values are compared
+  int32_t b;  // multiplied by the multiplier global when two values are compared
+} user_t;
+
+int32_t b_startval;  // value the initializer stores in the accumulator's b field
+int32_t a_startval;  // value the initializer stores in the accumulator's a field
+float multiplier;    // weight applied to the b field in every comparison and in the output
+
+static void find_min_user_type_init(user_t *alloc) {  // initializer() of the reduce pragma
+  alloc->a = a_startval;  // seed both fields from the script globals
+  alloc->b = b_startval;
+}
+
+static void find_min_user_type_accum(user_t *accum, const user_t val) {  // accumulator()
+  if (val.a + val.b * multiplier < accum->a + accum->b * multiplier) {  // val scores lower
+    accum->a = val.a;  // keep the lower-scoring value in the accumulator
+    accum->b = val.b;
+  }
+}
+
+// A combiner is autogenerated when the script does not define one, and lldb's
+// handling of both cases is tested by generating two test apps from this
+// source: this combiner is compiled only when RSTESTS_USER_COMBINER is defined.
+// It applies the same comparison as the accumulator, taking val by pointer.
+#if defined(RSTESTS_USER_COMBINER)
+static void find_min_user_type_comb(user_t *accum, const user_t *val) {
+  if (val->a + val->b * multiplier < accum->a + accum->b * multiplier) {  // val scores lower
+    accum->a = val->a;  // keep the lower-scoring partial result
+    accum->b = val->b;
+  }
+}
+#endif
+
+static void find_min_user_type_outc(float *output, const user_t *val) {  // outconverter()
+  *output = val->a + val->b * multiplier;  // emit the weighted score, not the struct
+}

diff --git a/tests/lldb/java/ScriptGroup/Android.mk b/tests/lldb/java/ScriptGroup/Android.mk
new file mode 100644
index 0000000..18a1cd5
--- /dev/null
+++ b/tests/lldb/java/ScriptGroup/Android.mk

@@ -0,0 +1,15 @@
+# Builds the ScriptGroup test package from the Java and RenderScript sources
+# found under src.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := ScriptGroup
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_PRIVATE_PLATFORM_APIS := true
+
+# Scripts are compiled with debug info (-g) and optimisation disabled (-O0).
+LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
+
+include $(BUILD_PACKAGE)

diff --git a/toolkit/test/AndroidManifest.xml b/tests/lldb/java/ScriptGroup/AndroidManifest.xml
similarity index 60%
copy from toolkit/test/AndroidManifest.xml
copy to tests/lldb/java/ScriptGroup/AndroidManifest.xml
index f709790..5288c74 100644
--- a/toolkit/test/AndroidManifest.xml
+++ b/tests/lldb/java/ScriptGroup/AndroidManifest.xml

@@ -1,15 +1,12 @@
 <?xml version="1.0" encoding="utf-8"?>
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
-        package="com.example.testapp">
-
-    <application
-            android:allowBackup="true"
-            android:label="Toolkit Test"
-            android:supportsRtl="true">
-        <activity android:name=".MainActivity">
+    package="com.android.rs.scriptgroup">
+    <uses-sdk android:minSdkVersion="21" />
+    <application android:label="scriptgroup"
+                 android:hardwareAccelerated="true">
+        <activity android:name="MainActivity">
             <intent-filter>
                 <action android:name="android.intent.action.MAIN" />
-
                 <category android:name="android.intent.category.LAUNCHER" />
             </intent-filter>
         </activity>

diff --git a/tests/lldb/java/ScriptGroup/res/layout/main_layout.xml b/tests/lldb/java/ScriptGroup/res/layout/main_layout.xml
new file mode 100644
index 0000000..4ef172f
--- /dev/null
+++ b/tests/lldb/java/ScriptGroup/res/layout/main_layout.xml

@@ -0,0 +1,14 @@
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#0099cc"
+    tools:context=".MainActivity">
+
+    <ImageView
+        android:id="@+id/imageView"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:scaleType="fitCenter" />
+
+</FrameLayout>

diff --git a/tests/lldb/java/ScriptGroup/src/com/android/rs/scriptgroup/MainActivity.java b/tests/lldb/java/ScriptGroup/src/com/android/rs/scriptgroup/MainActivity.java
new file mode 100644
index 0000000..01e8a13
--- /dev/null
+++ b/tests/lldb/java/ScriptGroup/src/com/android/rs/scriptgroup/MainActivity.java

@@ -0,0 +1,65 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.android.rs.scriptgroup;
+
+import android.app.Activity;
+import android.graphics.Bitmap;
+import android.os.Bundle;
+import android.widget.ImageView;
+import android.renderscript.*;
+
+public class MainActivity extends Activity {
+    private static final int ARRAY_SIZE = 8;
+
+    /**
+     * Runs the foo kernel followed by the goo kernel as a single script group
+     * over the integers 0 .. ARRAY_SIZE - 1, then shuts the context down.
+     */
+    @Override
+    protected void onCreate(Bundle savedInstanceState) {
+        super.onCreate(savedInstanceState);
+        setContentView(R.layout.main_layout);
+
+        // RenderScript context, created with the wait-for-attach and
+        // low-latency flags.
+        final int contextFlags =
+            RenderScript.CREATE_FLAG_WAIT_FOR_ATTACH | RenderScript.CREATE_FLAG_LOW_LATENCY;
+        RenderScript rs =
+            RenderScript.create(this, RenderScript.ContextType.NORMAL, contextFlags);
+
+        ScriptC_scriptgroup kernels = new ScriptC_scriptgroup(rs);
+
+        // Input allocation holding 0, 1, ..., ARRAY_SIZE - 1.
+        int[] values = new int[ARRAY_SIZE];
+        int index = 0;
+        while (index < ARRAY_SIZE) {
+            values[index] = index;
+            index++;
+        }
+        Allocation input = Allocation.createSized(rs, Element.I32(rs), ARRAY_SIZE);
+        input.copyFrom(values);
+
+        ScriptGroup.Builder2 builder = new ScriptGroup.Builder2(rs);
+        ScriptGroup.Input groupInput = builder.addInput();
+
+        // First closure: foo applied to the (not yet bound) group input.
+        Script.KernelID fooId = kernels.getKernelID_foo();
+        Type fooOutType = Type.createX(rs, Element.I32(rs), ARRAY_SIZE);
+        ScriptGroup.Closure fooStage = builder.addKernel(fooId, fooOutType, groupInput);
+
+        // Second closure: goo applied to the result of the first closure.
+        Script.KernelID gooId = kernels.getKernelID_goo();
+        Type gooOutType = Type.createX(rs, Element.I32(rs), ARRAY_SIZE);
+        ScriptGroup.Closure gooStage =
+            builder.addKernel(gooId, gooOutType, fooStage.getReturn());
+
+        ScriptGroup group = builder.create("scriptgroup_test", gooStage.getReturn());
+
+        // Execute once and copy the single output back into a Java array.
+        int[] results = new int[ARRAY_SIZE];
+        Object[] outputs = group.execute(input);
+        Allocation outputAlloc = (Allocation) outputs[0];
+        outputAlloc.copyTo(results);
+
+        rs.finish();
+        rs.destroy();
+    }
+}

diff --git a/tests/lldb/java/ScriptGroup/src/rs/scriptgroup.rscript b/tests/lldb/java/ScriptGroup/src/rs/scriptgroup.rscript
new file mode 100644
index 0000000..29089e1
--- /dev/null
+++ b/tests/lldb/java/ScriptGroup/src/rs/scriptgroup.rscript

@@ -0,0 +1,27 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.scriptgroup)
+#pragma rs_fp_full
+
+int __attribute__((kernel)) foo(int a) {  // first kernel of the script group: squares a
+    return a * a;
+}
+
+int __attribute__((kernel)) goo(int a) {  // second kernel of the script group: doubles a
+    return a + a;
+}

diff --git a/tests/lldb/java/SingleSource/Android.mk b/tests/lldb/java/SingleSource/Android.mk
new file mode 100644
index 0000000..202c2a7
--- /dev/null
+++ b/tests/lldb/java/SingleSource/Android.mk

@@ -0,0 +1,15 @@
+# Builds the SingleSource test package from the Java and RenderScript sources
+# found under src.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := SingleSource
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_PRIVATE_PLATFORM_APIS := true
+
+# Scripts are compiled with debug info (-g) and optimisation disabled (-O0).
+LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
+
+include $(BUILD_PACKAGE)

diff --git a/toolkit/test/AndroidManifest.xml b/tests/lldb/java/SingleSource/AndroidManifest.xml
similarity index 60%
copy from toolkit/test/AndroidManifest.xml
copy to tests/lldb/java/SingleSource/AndroidManifest.xml
index f709790..8820c93 100644
--- a/toolkit/test/AndroidManifest.xml
+++ b/tests/lldb/java/SingleSource/AndroidManifest.xml

@@ -1,17 +1,15 @@
 <?xml version="1.0" encoding="utf-8"?>
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
-        package="com.example.testapp">
-
-    <application
-            android:allowBackup="true"
-            android:label="Toolkit Test"
-            android:supportsRtl="true">
-        <activity android:name=".MainActivity">
+    package="com.android.rs.singlesource">
+    <uses-sdk android:minSdkVersion="21" />
+    <application android:label="SingleSource"
+                 android:hardwareAccelerated="true">
+        <activity android:name="MainActivity">
             <intent-filter>
                 <action android:name="android.intent.action.MAIN" />
-
                 <category android:name="android.intent.category.LAUNCHER" />
             </intent-filter>
         </activity>
     </application>
 </manifest>
+

diff --git a/tests/lldb/java/SingleSource/res/layout/main_layout.xml b/tests/lldb/java/SingleSource/res/layout/main_layout.xml
new file mode 100644
index 0000000..4ef172f
--- /dev/null
+++ b/tests/lldb/java/SingleSource/res/layout/main_layout.xml

@@ -0,0 +1,14 @@
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#0099cc"
+    tools:context=".MainActivity">
+
+    <ImageView
+        android:id="@+id/imageView"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:scaleType="fitCenter" />
+
+</FrameLayout>

diff --git a/tests/lldb/java/SingleSource/src/com/android/rs/singlesource/MainActivity.java b/tests/lldb/java/SingleSource/src/com/android/rs/singlesource/MainActivity.java
new file mode 100644
index 0000000..8308043
--- /dev/null
+++ b/tests/lldb/java/SingleSource/src/com/android/rs/singlesource/MainActivity.java

@@ -0,0 +1,77 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.android.rs.singlesource;
+
+import android.app.Activity;
+import android.graphics.Bitmap;
+import android.os.Bundle;
+import android.widget.ImageView;
+import android.renderscript.*;
+
+public class MainActivity extends Activity {
+
+    private RenderScript mRS;
+    private Allocation mAllocIn1;
+    private Allocation mAllocIn2;
+    private Allocation mAllocOut;
+    private ScriptC_rs_single_source mScript;
+
+    /**
+     * Creates the context, script and four-element float allocations, then
+     * calls both invokable script functions and launches void_kernel_1.
+     */
+    @Override
+    protected void onCreate(Bundle savedInstanceState) {
+        super.onCreate(savedInstanceState);
+        setContentView(R.layout.main_layout);
+
+        // Context created with the wait-for-attach and low-latency flags.
+        final int contextFlags = RenderScript.CREATE_FLAG_WAIT_FOR_ATTACH
+                | RenderScript.CREATE_FLAG_LOW_LATENCY;
+        mRS = RenderScript.create(this, RenderScript.ContextType.NORMAL, contextFlags);
+
+        // Script instance bound to that context.
+        mScript = new ScriptC_rs_single_source(mRS);
+
+        // First input: 1, 2, 3, 4.
+        mAllocIn1 = Allocation.createSized(mRS, Element.F32(mRS), 4);
+        final float[] firstValues = { 1.f, 2.f, 3.f, 4.f };
+        mAllocIn1.copyFrom(firstValues);
+
+        // Second input: 5, 6, 7, 8.
+        mAllocIn2 = Allocation.createSized(mRS, Element.F32(mRS), 4);
+        final float[] secondValues = { 5.f, 6.f, 7.f, 8.f };
+        mAllocIn2.copyFrom(secondValues);
+
+        // Output of the kernels launched by script_invoke_1.
+        mAllocOut = Allocation.createSized(mRS, Element.F32(mRS), 4);
+
+        // Separate allocation handed to the script as its global_alloc.
+        Allocation globalAlloc = Allocation.createSized(mRS, Element.F32(mRS), 4);
+        mScript.set_global_alloc(globalAlloc);
+
+        // Invokable function 1: takes the output and both inputs.
+        mScript.invoke_script_invoke_1(mAllocOut, mAllocIn1, mAllocIn2);
+
+        // Invokable function 2: takes no arguments.
+        mScript.invoke_script_invoke_2();
+
+        // Launch the void kernel directly with the X range set to (0, 4).
+        Script.LaunchOptions xRange = new Script.LaunchOptions();
+        xRange.setX(0, 4);
+        mScript.forEach_void_kernel_1(xRange);
+    }
+}

diff --git a/tests/lldb/java/SingleSource/src/rs/rs_single_source.rscript b/tests/lldb/java/SingleSource/src/rs/rs_single_source.rscript
new file mode 100644
index 0000000..15c35ef
--- /dev/null
+++ b/tests/lldb/java/SingleSource/src/rs/rs_single_source.rscript

@@ -0,0 +1,72 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.singlesource)
+#pragma rs_fp_full
+
+// global allocation written by void_kernel_1, one float per x index
+rs_allocation global_alloc;
+
+static void check_in()  // called by the invoke functions after each kernel launch
+{
+    // intentionally empty: serves only as a debugger check point
+    return;
+}
+
+float __attribute__((kernel)) kernel_1(float a)  // one-input kernel launched by script_invoke_1
+{
+    // square of the input element
+    return a * a;
+}
+
+float __attribute__((kernel)) kernel_2(float a, float b)  // two-input kernel launched by script_invoke_1
+{
+    // product of the two input elements
+    return a * b;
+}
+
+void __attribute__((kernel)) void_kernel_1(uint32_t x)  // takes only the x index, returns nothing
+{
+    // global_alloc[x] = (float)x
+    rsSetElementAt_float(global_alloc, (float)x, x);
+}
+
+void script_invoke_1(rs_allocation out, rs_allocation in1, rs_allocation in2)
+{
+    // launch the one-input kernel: out receives kernel_1 applied to in1
+    rsForEach(kernel_1, out, in1);
+
+    check_in();
+
+    // launch the two-input kernel into the same out, from in1 and in2
+    rsForEach(kernel_2, out, in1, in2);
+
+    check_in();
+}
+
+void script_invoke_2()
+{
+    // launch void_kernel_1 (no input or output allocation) with xStart=0 and xEnd=4
+    rs_script_call_t options = {
+        .strategy=RS_FOR_EACH_STRATEGY_DONT_CARE,
+        .xStart=0,
+        .xEnd=4
+    };
+    rsForEachWithOptions(void_kernel_1, &options);
+
+    check_in();
+}

diff --git a/tests/lldb/jni/Allocations/Android.mk b/tests/lldb/jni/Allocations/Android.mk
new file mode 100644
index 0000000..67ef2d9
--- /dev/null
+++ b/tests/lldb/jni/Allocations/Android.mk

@@ -0,0 +1,19 @@
+# Builds the JNIAllocations test package and then pulls in the makefile of the
+# native library it bundles.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) \
+                   $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := JNIAllocations
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+
+# Native library defined by jniallocations/Android.mk, included at the end.
+LOCAL_JNI_SHARED_LIBRARIES := libjniallocations
+
+# Scripts are compiled with debug info (-g) and optimisation disabled (-O0).
+LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
+
+include $(BUILD_PACKAGE)
+include $(LOCAL_PATH)/jniallocations/Android.mk

diff --git a/toolkit/test/AndroidManifest.xml b/tests/lldb/jni/Allocations/AndroidManifest.xml
similarity index 60%
copy from toolkit/test/AndroidManifest.xml
copy to tests/lldb/jni/Allocations/AndroidManifest.xml
index f709790..e73799f 100644
--- a/toolkit/test/AndroidManifest.xml
+++ b/tests/lldb/jni/Allocations/AndroidManifest.xml

@@ -1,17 +1,15 @@
 <?xml version="1.0" encoding="utf-8"?>
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
-        package="com.example.testapp">
-
-    <application
-            android:allowBackup="true"
-            android:label="Toolkit Test"
-            android:supportsRtl="true">
-        <activity android:name=".MainActivity">
+    package="com.android.rs.jniallocations">
+    <uses-sdk android:minSdkVersion="21" />
+    <application android:label="JNIAllocations"
+                 android:hardwareAccelerated="true">
+        <activity android:name="MainActivity">
             <intent-filter>
                 <action android:name="android.intent.action.MAIN" />
-
                 <category android:name="android.intent.category.LAUNCHER" />
             </intent-filter>
         </activity>
     </application>
 </manifest>
+

diff --git a/tests/lldb/jni/Allocations/jniallocations/Android.mk b/tests/lldb/jni/Allocations/jniallocations/Android.mk
new file mode 100644
index 0000000..e52e1a3
--- /dev/null
+++ b/tests/lldb/jni/Allocations/jniallocations/Android.mk

@@ -0,0 +1,15 @@
+# Builds libjniallocations, the shared library made from the JNI C++ source
+# and the allocs RenderScript file.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libjniallocations
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+
+LOCAL_SRC_FILES := jniallocations.cpp allocs.rscript
+
+# Only -g is set here.
+# NOTE(review): the included common.mk may adjust further flags - not visible here.
+LOCAL_RENDERSCRIPT_FLAGS := -g
+
+LOCAL_STATIC_LIBRARIES := libcompiler_rt
+
+# Settings shared by the JNI lldb tests.
+include frameworks/rs/tests/lldb/jni/common.mk
+include $(BUILD_SHARED_LIBRARY)

diff --git a/tests/lldb/jni/Allocations/jniallocations/allocs.rscript b/tests/lldb/jni/Allocations/jniallocations/allocs.rscript
new file mode 100644
index 0000000..c5ebcbf
--- /dev/null
+++ b/tests/lldb/jni/Allocations/jniallocations/allocs.rscript

@@ -0,0 +1,47 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.jniallocations)
+
+// Kernel performs a basic vector swizzle: the four components come back in reverse order
+uchar4 __attribute__((kernel)) swizzle_kernel(uchar4 in)
+{
+    return in.wzyx;
+}
+
+// Kernel squares every element in allocation, converting each ushort to uint before multiplying
+uint __attribute__((kernel)) square_kernel(ushort in)
+{
+    uint result = (uint)(in) * (uint)in;
+    return result;
+}
+
+// Helper function adding 1/2 to the passed in double; called per component by add_half_kernel
+static double half_helper(double in)
+{
+    return (in + 0.5);
+}
+
+// Kernel returns first 3 elements of a double4 plus 1/2; the w component is not used
+double3 __attribute__((kernel)) add_half_kernel(double4 in)
+{
+    double3 result;
+    result.x = half_helper(in.x);
+    result.y = half_helper(in.y);
+    result.z = half_helper(in.z);
+    return result;
+}

diff --git a/tests/lldb/jni/Allocations/jniallocations/jniallocations.cpp b/tests/lldb/jni/Allocations/jniallocations/jniallocations.cpp
new file mode 100644
index 0000000..f86b64c
--- /dev/null
+++ b/tests/lldb/jni/Allocations/jniallocations/jniallocations.cpp

@@ -0,0 +1,424 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <memory>
+
+#include <jni.h>
+#include <RenderScript.h>
+
+#include "ScriptC_allocs.h"
+
+// RenderScript context handle passed to every element, type and allocation
+// created in this file.
+sp<RS> mRS;
+
+// One global allocation handle per element type. The trailing comment on each
+// line names the element type; the create*Allocations() functions assign them.
+sp<Allocation> mBoolAllocation;  // boolean
+
+sp<Allocation> mCharAllocation;  // char
+sp<Allocation> mChar2Allocation;   // char2
+sp<Allocation> mChar3Allocation;   // char3
+sp<Allocation> mChar4Allocation;   // char4
+
+sp<Allocation> mUCharAllocation;   // uchar
+sp<Allocation> mUChar2Allocation;  // uchar2
+sp<Allocation> mUChar3Allocation;  // uchar3
+sp<Allocation> mUChar4Allocation;  // uchar4
+
+sp<Allocation> mShortAllocation;   // short
+sp<Allocation> mShort2Allocation;  // short2
+sp<Allocation> mShort3Allocation;  // short3
+sp<Allocation> mShort4Allocation;  // short4
+
+sp<Allocation> mUShortAllocation;  // ushort
+sp<Allocation> mUShort2Allocation; // ushort2
+sp<Allocation> mUShort3Allocation; // ushort3
+sp<Allocation> mUShort4Allocation; // ushort4
+
+sp<Allocation> mIntAllocation;   // int
+sp<Allocation> mInt2Allocation;  // int2
+sp<Allocation> mInt3Allocation;  // int3
+sp<Allocation> mInt4Allocation;  // int4
+
+sp<Allocation> mUIntAllocation;  // uint
+sp<Allocation> mUInt2Allocation;   // uint2
+sp<Allocation> mUInt3Allocation;   // uint3
+sp<Allocation> mUInt4Allocation;   // uint4
+
+sp<Allocation> mLongAllocation;  // long
+sp<Allocation> mLong2Allocation;   // long2
+sp<Allocation> mLong3Allocation;   // long3
+sp<Allocation> mLong4Allocation;   // long4
+
+sp<Allocation> mULongAllocation;   // ulong
+sp<Allocation> mULong2Allocation;  // ulong2
+sp<Allocation> mULong3Allocation;  // ulong3
+sp<Allocation> mULong4Allocation;  // ulong4
+
+sp<Allocation> mHalfAllocation;  // half
+sp<Allocation> mHalf2Allocation;   // half2
+sp<Allocation> mHalf3Allocation;   // half3
+sp<Allocation> mHalf4Allocation;   // half4
+
+sp<Allocation> mFloatAllocation;   // float
+sp<Allocation> mFloat2Allocation;  // float2
+sp<Allocation> mFloat3Allocation;  // float3
+sp<Allocation> mFloat4Allocation;  // float4
+
+sp<Allocation> mDoubleAllocation;  // double
+sp<Allocation> mDouble2Allocation; // double2
+sp<Allocation> mDouble3Allocation; // double3
+sp<Allocation> mDouble4Allocation; // double4
+
+// Number of scalars in each host staging buffer; the 1D allocations hold
+// mAllocSize, mAllocSize / 2 or mAllocSize / 4 elements.
+const int mAllocSize = 24; // Needs to be < CHAR_MAX and divisible by 4.
+// NOTE(review): mBitmapSize is not referenced in the visible part of this file.
+const int mBitmapSize = 64;
+
+// Creates the boolean allocation and all signed-integer allocations (char,
+// short, int and long in one- to four-component form). For each element width
+// one allocation is built from an explicit multi-dimensional Type; the others
+// are flat 1D allocations sized from mAllocSize. The creation order is the
+// same as it has always been.
+void createSignedAllocations() {
+    const int halfCount = mAllocSize / 2;
+    const int quarterCount = mAllocSize / 4;
+
+    // char: 1 x 3 x 8, a single element in X to test 16 byte memory alignment
+    Type::Builder charType(mRS, Element::I8(mRS));
+    charType.setX(1);
+    charType.setY(3);
+    charType.setZ(8);
+
+    mCharAllocation = Allocation::createTyped(mRS, charType.create());
+    mChar2Allocation = Allocation::createSized(mRS, Element::I8_2(mRS), halfCount);
+    mChar3Allocation = Allocation::createSized(mRS, Element::I8_3(mRS), quarterCount);
+    mChar4Allocation = Allocation::createSized(mRS, Element::I8_4(mRS), quarterCount);
+
+    // short2: 6 x 1 x 2
+    Type::Builder short2Type(mRS, Element::I16_2(mRS));
+    short2Type.setX(6);
+    short2Type.setY(1);
+    short2Type.setZ(2);
+
+    mShortAllocation = Allocation::createSized(mRS, Element::I16(mRS), mAllocSize);
+    mShort2Allocation = Allocation::createTyped(mRS, short2Type.create());
+    mShort3Allocation = Allocation::createSized(mRS, Element::I16_3(mRS), quarterCount);
+    mShort4Allocation = Allocation::createSized(mRS, Element::I16_4(mRS), quarterCount);
+
+    // int3: 3 x 2
+    Type::Builder int3Type(mRS, Element::I32_3(mRS));
+    int3Type.setX(3);
+    int3Type.setY(2);
+
+    mIntAllocation = Allocation::createSized(mRS, Element::I32(mRS), mAllocSize);
+    mInt2Allocation = Allocation::createSized(mRS, Element::I32_2(mRS), halfCount);
+    mInt3Allocation = Allocation::createTyped(mRS, int3Type.create());
+    mInt4Allocation = Allocation::createSized(mRS, Element::I32_4(mRS), quarterCount);
+
+    // long4: 1 x 6
+    Type::Builder long4Type(mRS, Element::I64_4(mRS));
+    long4Type.setX(1);
+    long4Type.setY(6);
+
+    mLongAllocation = Allocation::createSized(mRS, Element::I64(mRS), mAllocSize);
+    mLong2Allocation = Allocation::createSized(mRS, Element::I64_2(mRS), halfCount);
+    mLong3Allocation = Allocation::createSized(mRS, Element::I64_3(mRS), quarterCount);
+    mLong4Allocation = Allocation::createTyped(mRS, long4Type.create());
+
+    mBoolAllocation = Allocation::createSized(mRS, Element::BOOLEAN(mRS), mAllocSize);
+}
+
+// Fills the boolean and signed-integer allocations with known data.
+//
+// Each staging array holds mAllocSize scalars: the char, short, int and long
+// arrays count up from 0, and the bool array alternates 0, 1, 0, 1, ...
+// Every allocation of a given element width is loaded from the same staging
+// array, using the 1D, 2D or 3D copy that matches how it was created.
+//
+// The staging arrays are automatic (mAllocSize is a compile-time constant),
+// so there is nothing to free by hand.
+void initSignedAllocations() {
+    char charData[mAllocSize];
+    short shortData[mAllocSize];
+    int intData[mAllocSize];
+    int64_t longData[mAllocSize];
+    char boolData[mAllocSize];
+
+    for (int i = 0; i < mAllocSize; ++i) {
+        charData[i] = (char) i;
+        shortData[i] = (short) i;
+        intData[i] = (int) i;
+        longData[i] = (int64_t) i;
+        boolData[i] = (char) (0x01 & i);  // lowest bit of the index
+    }
+
+    mCharAllocation->copy3DRangeFrom(0, 0, 0, 1, 3, 8, charData);
+    mChar2Allocation->copy1DRangeFrom(0, mAllocSize/2, charData);
+    mChar3Allocation->copy1DRangeFrom(0, mAllocSize/4, charData);
+    mChar4Allocation->copy1DRangeFrom(0, mAllocSize/4, charData);
+
+    mShortAllocation->copy1DRangeFrom(0, mAllocSize, shortData);
+    mShort2Allocation->copy3DRangeFrom(0, 0, 0, 6, 1, 2, shortData);
+    mShort3Allocation->copy1DRangeFrom(0, mAllocSize/4, shortData);
+    mShort4Allocation->copy1DRangeFrom(0, mAllocSize/4, shortData);
+
+    mIntAllocation->copy1DRangeFrom(0, mAllocSize, intData);
+    mInt2Allocation->copy1DRangeFrom(0, mAllocSize/2, intData);
+    mInt3Allocation->copy2DRangeFrom(0, 0, 3, 2, intData);
+    mInt4Allocation->copy1DRangeFrom(0, mAllocSize/4, intData);
+
+    mLongAllocation->copy1DRangeFrom(0, mAllocSize, longData);
+    mLong2Allocation->copy1DRangeFrom(0, mAllocSize/2, longData);
+    mLong3Allocation->copy1DRangeFrom(0, mAllocSize/4, longData);
+    mLong4Allocation->copy2DRangeFrom(0, 0, 1, 6, longData);
+
+    mBoolAllocation->copy1DRangeFrom(0, mAllocSize, boolData);
+}
+
+// Creates all unsigned-integer allocations (uchar, ushort, uint and ulong in
+// one- to four-component form). For each element width one allocation is
+// built from an explicit multi-dimensional Type; the others are flat 1D
+// allocations sized from mAllocSize. The creation order is the same as it
+// has always been.
+void createUnsignedAllocations() {
+    const int halfCount = mAllocSize / 2;
+    const int quarterCount = mAllocSize / 4;
+
+    // uchar2: 2 x 6
+    Type::Builder uchar2Type(mRS, Element::U8_2(mRS));
+    uchar2Type.setX(2);
+    uchar2Type.setY(6);
+
+    mUCharAllocation = Allocation::createSized(mRS, Element::U8(mRS), mAllocSize);
+    mUChar2Allocation = Allocation::createTyped(mRS, uchar2Type.create());
+    mUChar3Allocation = Allocation::createSized(mRS, Element::U8_3(mRS), quarterCount);
+    mUChar4Allocation = Allocation::createSized(mRS, Element::U8_4(mRS), quarterCount);
+
+    // ushort3: 1 x 6
+    Type::Builder ushort3Type(mRS, Element::U16_3(mRS));
+    ushort3Type.setX(1);
+    ushort3Type.setY(6);
+
+    mUShortAllocation = Allocation::createSized(mRS, Element::U16(mRS), mAllocSize);
+    mUShort2Allocation = Allocation::createSized(mRS, Element::U16_2(mRS), halfCount);
+    mUShort3Allocation = Allocation::createTyped(mRS, ushort3Type.create());
+    mUShort4Allocation = Allocation::createSized(mRS, Element::U16_4(mRS), quarterCount);
+
+    // uint4: 1 x 1 x 6
+    Type::Builder uint4Type(mRS, Element::U32_4(mRS));
+    uint4Type.setX(1);
+    uint4Type.setY(1);
+    uint4Type.setZ(6);
+
+    mUIntAllocation = Allocation::createSized(mRS, Element::U32(mRS), mAllocSize);
+    mUInt2Allocation = Allocation::createSized(mRS, Element::U32_2(mRS), halfCount);
+    mUInt3Allocation = Allocation::createSized(mRS, Element::U32_3(mRS), quarterCount);
+    mUInt4Allocation = Allocation::createTyped(mRS, uint4Type.create());
+
+    // ulong: 4 x 3 x 2
+    Type::Builder ulongType(mRS, Element::U64(mRS));
+    ulongType.setX(4);
+    ulongType.setY(3);
+    ulongType.setZ(2);
+
+    mULongAllocation = Allocation::createTyped(mRS, ulongType.create());
+    mULong2Allocation = Allocation::createSized(mRS, Element::U64_2(mRS), halfCount);
+    mULong3Allocation = Allocation::createSized(mRS, Element::U64_3(mRS), quarterCount);
+    mULong4Allocation = Allocation::createSized(mRS, Element::U64_4(mRS), quarterCount);
+}
+
+// Fills the unsigned-integer allocations with known data.
+//
+// Each staging array holds mAllocSize scalars counting up from 0. Every
+// allocation of a given element width is loaded from the same staging array,
+// using the 1D, 2D or 3D copy that matches how it was created. The 8-, 16-
+// and 32-bit staging arrays keep their signed element types; only raw bytes
+// are handed to the copy calls.
+//
+// The staging arrays are automatic (mAllocSize is a compile-time constant),
+// so there is nothing to free by hand.
+void initUnsignedAllocations() {
+    char charData[mAllocSize];
+    short shortData[mAllocSize];
+    int intData[mAllocSize];
+    uint64_t longData[mAllocSize];
+
+    for (int i = 0; i < mAllocSize; ++i) {
+        charData[i] = (char) i;
+        shortData[i] = (short) i;
+        intData[i] = (int) i;
+        longData[i] = (uint64_t) i;
+    }
+
+    mUCharAllocation->copy1DRangeFrom(0, mAllocSize, charData);
+    mUChar2Allocation->copy2DRangeFrom(0, 0, 2, 6, charData);
+    mUChar3Allocation->copy1DRangeFrom(0, mAllocSize/4, charData);
+    mUChar4Allocation->copy1DRangeFrom(0, mAllocSize/4, charData);
+
+    mUShortAllocation->copy1DRangeFrom(0, mAllocSize, shortData);
+    mUShort2Allocation->copy1DRangeFrom(0, mAllocSize/2, shortData);
+    mUShort3Allocation->copy2DRangeFrom(0, 0, 1, 6, shortData);
+    mUShort4Allocation->copy1DRangeFrom(0, mAllocSize/4, shortData);
+
+    mUIntAllocation->copy1DRangeFrom(0, mAllocSize, intData);
+    mUInt2Allocation->copy1DRangeFrom(0, mAllocSize/2, intData);
+    mUInt3Allocation->copy1DRangeFrom(0, mAllocSize/4, intData);
+    mUInt4Allocation->copy3DRangeFrom(0, 0, 0, 1, 1, 6, intData);
+
+    mULongAllocation->copy3DRangeFrom(0, 0, 0, 4, 3, 2, longData);
+    mULong2Allocation->copy1DRangeFrom(0, mAllocSize/2, longData);
+    mULong3Allocation->copy1DRangeFrom(0, mAllocSize/4, longData);
+    mULong4Allocation->copy1DRangeFrom(0, mAllocSize/4, longData);
+}
+
+// Creates all floating-point allocations (half, float and double in one- to
+// four-component form). half3, float4 and the three double vector
+// allocations are built from explicit multi-dimensional Types; the others
+// are flat 1D allocations sized from mAllocSize. The creation order is the
+// same as it has always been.
+void createFloatAllocations() {
+    const int halfCount = mAllocSize / 2;
+    const int quarterCount = mAllocSize / 4;
+
+    // half3: 1 x 6
+    Type::Builder half3Type(mRS, Element::F16_3(mRS));
+    half3Type.setX(1);
+    half3Type.setY(6);
+
+    mHalfAllocation = Allocation::createSized(mRS, Element::F16(mRS), mAllocSize);
+    mHalf2Allocation = Allocation::createSized(mRS, Element::F16_2(mRS), halfCount);
+    mHalf3Allocation = Allocation::createTyped(mRS, half3Type.create());
+    mHalf4Allocation = Allocation::createSized(mRS, Element::F16_4(mRS), quarterCount);
+
+    // float4: 3 x 2
+    Type::Builder float4Type(mRS, Element::F32_4(mRS));
+    float4Type.setX(3);
+    float4Type.setY(2);
+
+    mFloatAllocation = Allocation::createSized(mRS, Element::F32(mRS), mAllocSize);
+    mFloat2Allocation = Allocation::createSized(mRS, Element::F32_2(mRS), halfCount);
+    mFloat3Allocation = Allocation::createSized(mRS, Element::F32_3(mRS), quarterCount);
+    mFloat4Allocation = Allocation::createTyped(mRS, float4Type.create());
+
+    // double2: 4 x 1 x 3
+    Type::Builder double2Type(mRS, Element::F64_2(mRS));
+    double2Type.setX(4);
+    double2Type.setY(1);
+    double2Type.setZ(3);
+
+    mDoubleAllocation = Allocation::createSized(mRS, Element::F64(mRS), mAllocSize);
+    mDouble2Allocation = Allocation::createTyped(mRS, double2Type.create());
+
+    // double3: 1 x 2 x 3
+    Type::Builder double3Type(mRS, Element::F64_3(mRS));
+    double3Type.setX(1);
+    double3Type.setY(2);
+    double3Type.setZ(3);
+
+    // double4: 1 x 2 x 3
+    Type::Builder double4Type(mRS, Element::F64_4(mRS));
+    double4Type.setX(1);
+    double4Type.setY(2);
+    double4Type.setZ(3);
+
+    mDouble3Allocation = Allocation::createTyped(mRS, double3Type.create());
+    mDouble4Allocation = Allocation::createTyped(mRS, double4Type.create());
+}
+
+void initFloatAllocations() {  // Fills the allocations assigned in createFloatAllocations() with 1/i test data.
+    __fp16 *buffer_half = new __fp16[mAllocSize];
+    float *buffer_float = new float[mAllocSize];
+    double *buffer_double = new double[mAllocSize];  // one host staging buffer per scalar width, mAllocSize scalars each
+
+    for(int i = 0; i < mAllocSize; ++i) {  // NOTE(review): i starts at 0, so the first element of each buffer is 1/0 - confirm intended
+        buffer_half[i] = (__fp16) 1 / i;  // the cast binds to the literal 1, not to the quotient
+        buffer_float[i] = (float) 1 / i;
+        buffer_double[i] = (double) 1 / i;
+    }
+
+    mHalfAllocation->copy1DRangeFrom(0, mAllocSize, buffer_half);  // the same staging buffer feeds all four half allocations
+    mHalf2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_half);
+    mHalf3Allocation->copy2DRangeFrom(0, 0, 1, 6, buffer_half);  // 1 x 6 region, matching the type built for mHalf3Allocation
+    mHalf4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_half);
+
+    delete [] buffer_half;
+
+    mFloatAllocation->copy1DRangeFrom(0, mAllocSize, buffer_float);
+    mFloat2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_float);
+    mFloat3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_float);
+    mFloat4Allocation->copy2DRangeFrom(0, 0, 3, 2, buffer_float);  // 3 x 2 region, matching the type built for mFloat4Allocation
+
+    delete [] buffer_float;
+
+    mDoubleAllocation->copy1DRangeFrom(0, mAllocSize, buffer_double);
+    mDouble2Allocation->copy3DRangeFrom(0, 0, 0, 4, 1, 3, buffer_double);  // 4 x 1 x 3 region
+    mDouble3Allocation->copy3DRangeFrom(0, 0, 0, 1, 2, 3, buffer_double);  // 1 x 2 x 3 region
+    mDouble4Allocation->copy3DRangeFrom(0, 0, 0, 1, 2, 3, buffer_double);  // 1 x 2 x 3 region
+
+    delete [] buffer_double;
+}
+
+extern "C" void JNICALL
+Java_com_android_rs_jniallocations_MainActivity_nativeRS(  // JNI entry point for MainActivity.nativeRS(String)
+	JNIEnv * env,
+	jobject,  // receiver object: the Java side declares nativeRS as an instance method, so this is not a jclass
+	jstring pathObj)  // path passed to RS::init; the Java caller supplies its cache directory
+{
+    mRS = new RS();
+
+    const char * path = env->GetStringUTFChars(pathObj, nullptr);  // NOTE(review): result is not checked for nullptr
+    mRS->init(path, RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);  // presumably blocks until a debugger attaches - TODO confirm
+    env->ReleaseStringUTFChars(pathObj, path);
+
+    sp<ScriptC_allocs> mScript = new ScriptC_allocs(mRS);
+
+    Type::Builder typeRGBA_888Builder(mRS, Element::RGBA_8888(mRS));
+    typeRGBA_888Builder.setX(mBitmapSize);
+    typeRGBA_888Builder.setY(mBitmapSize);  // square mBitmapSize x mBitmapSize RGBA_8888 type shared by the in/out allocations
+
+    sp<Allocation> mInAllocation = Allocation::createTyped(mRS, typeRGBA_888Builder.create());
+
+    const int image_area = mBitmapSize*mBitmapSize;  // element count
+    const int image_size = image_area*sizeof(int);  // byte count, assuming one RGBA_8888 element is sizeof(int) bytes - TODO confirm
+
+    char *zero_buffer = new char[image_size];
+    memset(zero_buffer, 0, image_size);
+    mInAllocation->copy1DRangeFrom(0, image_area, zero_buffer);  // input image starts out all zero
+    delete [] zero_buffer;
+
+    sp<Allocation> mOutAllocation = Allocation::createTyped(mRS, typeRGBA_888Builder.create());
+    createSignedAllocations();
+    initSignedAllocations();
+
+    mRS->finish();  // stage 1: signed allocations exist while swizzle_kernel runs
+    mScript->forEach_swizzle_kernel(mInAllocation, mOutAllocation);
+    mRS->finish();
+
+    mCharAllocation.clear();
+    mChar2Allocation.clear();
+    mChar3Allocation.clear();
+    mChar4Allocation.clear();
+
+    mShort2Allocation.clear();  // NOTE(review): mShortAllocation is not cleared with its siblings - confirm deliberate
+    mShort3Allocation.clear();
+    mShort4Allocation.clear();
+
+    mIntAllocation.clear();
+    mInt2Allocation.clear();
+    mInt3Allocation.clear();
+    mInt4Allocation.clear();
+
+    mLongAllocation.clear();
+    mLong2Allocation.clear();
+    mLong3Allocation.clear();
+    mLong4Allocation.clear();
+
+    mBoolAllocation.clear();
+
+    createUnsignedAllocations();
+    initUnsignedAllocations();
+
+    mInAllocation = mUShortAllocation; // Host side assignment
+
+    mRS->finish();  // stage 2: unsigned allocations; kernel reads the ushort data and writes mUIntAllocation
+    mScript->forEach_square_kernel(mInAllocation, mUIntAllocation);
+    mRS->finish();
+
+    mUCharAllocation.clear();
+    mUChar2Allocation.clear();
+    mUChar3Allocation.clear();
+    mUChar4Allocation.clear();
+
+    mUShortAllocation.clear();  // mInAllocation still refers to the same allocation
+    mUShort2Allocation.clear();
+    mUShort3Allocation.clear();
+    mUShort4Allocation.clear();
+
+    mUInt2Allocation.clear();  // NOTE(review): mUIntAllocation (the kernel output above) is not cleared - confirm deliberate
+    mUInt3Allocation.clear();
+    mUInt4Allocation.clear();
+
+    mULongAllocation.clear();
+    mULong2Allocation.clear();
+    mULong3Allocation.clear();
+    mULong4Allocation.clear();
+
+    createFloatAllocations();
+    initFloatAllocations();
+
+    mRS->finish();  // stage 3: floating-point allocations
+    mScript->forEach_add_half_kernel(mDouble4Allocation, mDouble3Allocation);  // NOTE(review): kernel is named add_half but is given double allocations - confirm
+    mRS->finish();
+}
+

diff --git a/tests/lldb/jni/Allocations/res/layout/main_layout.xml b/tests/lldb/jni/Allocations/res/layout/main_layout.xml
new file mode 100644
index 0000000..131c3b5
--- /dev/null
+++ b/tests/lldb/jni/Allocations/res/layout/main_layout.xml

@@ -0,0 +1,15 @@
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#0099cc"
+    tools:context=".MainActivity">
+
+    <ImageView
+        android:id="@+id/imageView"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:scaleType="fitCenter" />
+
+</FrameLayout>
+

diff --git a/tests/lldb/jni/Allocations/src/com/android/rs/jniallocations/MainActivity.java b/tests/lldb/jni/Allocations/src/com/android/rs/jniallocations/MainActivity.java
new file mode 100644
index 0000000..f13682f
--- /dev/null
+++ b/tests/lldb/jni/Allocations/src/com/android/rs/jniallocations/MainActivity.java

@@ -0,0 +1,43 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.android.rs.jniallocations;
+
+import android.app.Activity;
+import android.os.Bundle;
+import android.graphics.BitmapFactory;
+import android.graphics.Bitmap;
+import android.widget.ImageView;
+
+public class MainActivity extends Activity {  // On creation, sets the layout and runs the native RenderScript test.
+    private Bitmap mBitmapIn;  // not referenced anywhere in this class
+    private Bitmap mBitmapOut;  // not referenced anywhere in this class
+
+    static {
+        System.loadLibrary("RS");  // loaded before the JNI library below
+        System.loadLibrary("jniallocations");  // library expected to provide nativeRS
+    }
+
+    native void nativeRS(String cacheDir);  // instance native method; receives the cache directory path
+
+    @Override
+    protected void onCreate(Bundle savedInstanceState) {
+        super.onCreate(savedInstanceState);
+        setContentView(R.layout.main_layout);
+        nativeRS(this.getCacheDir().toString());  // synchronous call: onCreate does not return until nativeRS does
+    }
+}
+

diff --git a/tests/lldb/jni/Android.mk b/tests/lldb/jni/Android.mk
new file mode 100644
index 0000000..5053e7d
--- /dev/null
+++ b/tests/lldb/jni/Android.mk

@@ -0,0 +1 @@
+include $(call all-subdir-makefiles)

diff --git a/tests/lldb/jni/BranchingFunCalls/Android.mk b/tests/lldb/jni/BranchingFunCalls/Android.mk
new file mode 100644
index 0000000..a5ee3b4
--- /dev/null
+++ b/tests/lldb/jni/BranchingFunCalls/Android.mk

@@ -0,0 +1,19 @@
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) \
+                   $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := JNIBranchingFunCalls
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+
+LOCAL_JNI_SHARED_LIBRARIES := libjnibranchingfuncalls
+
+LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
+
+include $(BUILD_PACKAGE)
+include $(LOCAL_PATH)/jnibranchingfuncalls/Android.mk

diff --git a/tests/lldb/jni/BranchingFunCalls/AndroidManifest.xml b/tests/lldb/jni/BranchingFunCalls/AndroidManifest.xml
new file mode 100644
index 0000000..3b616fa
--- /dev/null
+++ b/tests/lldb/jni/BranchingFunCalls/AndroidManifest.xml

@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    package="com.android.rs.jnibranchingfuncalls">
+    <uses-sdk android:minSdkVersion="21" />
+    <application android:label="JNIBranchingFunCalls"
+                 android:hardwareAccelerated="true">
+        <activity android:name="MainActivity">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+    </application>
+</manifest>
+

diff --git a/tests/lldb/jni/BranchingFunCalls/jnibranchingfuncalls/Android.mk b/tests/lldb/jni/BranchingFunCalls/jnibranchingfuncalls/Android.mk
new file mode 100644
index 0000000..69f9162
--- /dev/null
+++ b/tests/lldb/jni/BranchingFunCalls/jnibranchingfuncalls/Android.mk

@@ -0,0 +1,13 @@
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libjnibranchingfuncalls
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+
+LOCAL_SRC_FILES := jnibranchingfuncalls.cpp scalars.rscript
+
+LOCAL_RENDERSCRIPT_FLAGS := -g
+
+include frameworks/rs/tests/lldb/jni/common.mk
+include $(BUILD_SHARED_LIBRARY)

diff --git a/tests/lldb/jni/BranchingFunCalls/jnibranchingfuncalls/jnibranchingfuncalls.cpp b/tests/lldb/jni/BranchingFunCalls/jnibranchingfuncalls/jnibranchingfuncalls.cpp
new file mode 100644
index 0000000..4e2c4cf
--- /dev/null
+++ b/tests/lldb/jni/BranchingFunCalls/jnibranchingfuncalls/jnibranchingfuncalls.cpp

@@ -0,0 +1,62 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <memory>
+
+#include <jni.h>
+#include <RenderScript.h>
+
+#include "ScriptC_scalars.h"
+
+extern "C" void JNICALL
+Java_com_android_rs_jnibranchingfuncalls_MainActivity_nativeRS(  // JNI entry point for MainActivity.nativeRS(String)
+    JNIEnv * env,
+    jobject,  // receiver object: the Java side declares nativeRS as an instance method, so this is not a jclass
+    jstring pathObj)  // path passed to RS::init; the Java caller supplies its cache directory
+{
+    static const int size = 64;  // edge length of the square I32 allocations
+    sp<RS> rs = new RS();
+
+    const char * path = env->GetStringUTFChars(pathObj, nullptr);  // NOTE(review): result is not checked for nullptr
+    rs->init(path, RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);  // presumably blocks until a debugger attaches - TODO confirm
+    env->ReleaseStringUTFChars(pathObj, path);
+
+    auto e = Element::I32(rs);
+    Type::Builder tb(rs, e);
+    tb.setX(size);
+    tb.setY(size);
+    auto t = tb.create();  // size x size type of I32 elements
+
+    auto a = Allocation::createTyped(rs, t);  // kernel input
+    auto b = Allocation::createTyped(rs, t);  // kernel output
+
+    int * input = new int[size*size];
+    for(int i = 0; i < size*size; ++i) {
+        input[i] = i - (size*size / 2);  // values centred on zero, so the kernel sees negative, zero and positive inputs
+    }
+    a->copy2DRangeFrom(0, 0, size, size, input);
+    delete [] input;
+
+    // Script is executed once, then the data is copied back when finished
+    sp<ScriptC_scalars> s = new ScriptC_scalars(rs);
+    s->invoke_addToGlobal(234);  // calls addToGlobal in scalars.rscript with 234
+    s->forEach_simple_kernel(a, b);
+    rs->finish();
+    int32_t * output = new int32_t[size*size];
+    b->copy2DRangeTo(0, 0, size, size, output);  // result is read back but not inspected here
+    delete [] output;
+}
+

diff --git a/tests/lldb/jni/BranchingFunCalls/jnibranchingfuncalls/scalars.rscript b/tests/lldb/jni/BranchingFunCalls/jnibranchingfuncalls/scalars.rscript
new file mode 100644
index 0000000..b98df28
--- /dev/null
+++ b/tests/lldb/jni/BranchingFunCalls/jnibranchingfuncalls/scalars.rscript

@@ -0,0 +1,76 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.jnibranchingfuncalls)
+
+static bool is_neg(int a)  // Returns true when a is strictly negative, false otherwise (including zero).
+{
+    if(a < 0)
+        return true;
+    else
+        return false;
+}
+
+static bool is_pos(int a)  // Returns true when a is strictly positive, false otherwise (including zero).
+{
+    if(a > 0)
+        return true;
+    else
+        return false;
+}
+
+static void set_i(int * a, int b)  // Stores b into *a by way of a local temporary.
+{
+    int tmp = b;
+    *a = tmp;
+}
+
+static void modify_f(float * f)  // Halves the value pointed to by f, in place.
+{
+    *f *= 0.5f;
+}
+
+static void modify_i(int * i)  // Rewrites *i: negative -> 0, positive -> min(*i, cutoff), zero -> cutoff.
+{
+    int j = *i;
+    int cutoff = 2 << 6;  // 128
+    if(j > cutoff)
+        j = cutoff;  // clamp from above before classifying the sign
+    if(is_neg(j))
+        set_i(i, 0);
+    else if(is_pos(j))
+        set_i(i, j);
+    else
+        set_i(i, cutoff);  // reached only when j == 0
+}
+
+int __attribute__((kernel)) simple_kernel(int in)  // Returns in * (int)(in * 0.5f), after calling the helper functions above.
+{
+    int i = in;
+    float f = (float) i;
+    modify_f(&f);  // f becomes half of the input
+    modify_i(&i);  // i is rewritten, but it does not feed into the returned value
+    int ret = (int) f;
+    return in * ret;
+}
+
+int glob = 123;  // script global, modified by addToGlobal() below
+
+void addToGlobal(int arg)  // Adds arg to glob; the host in this test calls it as invoke_addToGlobal(234).
+{
+    glob += arg;
+}

diff --git a/tests/lldb/jni/BranchingFunCalls/res/layout/main_layout.xml b/tests/lldb/jni/BranchingFunCalls/res/layout/main_layout.xml
new file mode 100644
index 0000000..131c3b5
--- /dev/null
+++ b/tests/lldb/jni/BranchingFunCalls/res/layout/main_layout.xml

@@ -0,0 +1,15 @@
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#0099cc"
+    tools:context=".MainActivity">
+
+    <ImageView
+        android:id="@+id/imageView"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:scaleType="fitCenter" />
+
+</FrameLayout>
+

diff --git a/tests/lldb/jni/BranchingFunCalls/src/com/android/rs/jnibranchingfuncalls/MainActivity.java b/tests/lldb/jni/BranchingFunCalls/src/com/android/rs/jnibranchingfuncalls/MainActivity.java
new file mode 100644
index 0000000..b0ac283
--- /dev/null
+++ b/tests/lldb/jni/BranchingFunCalls/src/com/android/rs/jnibranchingfuncalls/MainActivity.java

@@ -0,0 +1,43 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.android.rs.jnibranchingfuncalls;
+
+import android.app.Activity;
+import android.os.Bundle;
+import android.graphics.BitmapFactory;
+import android.graphics.Bitmap;
+import android.widget.ImageView;
+
+public class MainActivity extends Activity {  // On creation, sets the layout and runs the native RenderScript test.
+    private Bitmap mBitmapIn;  // not referenced anywhere in this class
+    private Bitmap mBitmapOut;  // not referenced anywhere in this class
+
+    static {
+        System.loadLibrary("RS");  // loaded before the JNI library below
+        System.loadLibrary("jnibranchingfuncalls");  // library expected to provide nativeRS
+    }
+
+    native void nativeRS(String cacheDir);  // instance native method; receives the cache directory path
+
+    @Override
+    protected void onCreate(Bundle savedInstanceState) {
+        super.onCreate(savedInstanceState);
+        setContentView(R.layout.main_layout);
+        nativeRS(this.getCacheDir().toString());  // synchronous call: onCreate does not return until nativeRS does
+    }
+}
+

diff --git a/tests/lldb/jni/DebugWaitAttach/Android.mk b/tests/lldb/jni/DebugWaitAttach/Android.mk
new file mode 100644
index 0000000..14cf20d
--- /dev/null
+++ b/tests/lldb/jni/DebugWaitAttach/Android.mk

@@ -0,0 +1,19 @@
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) \
+                   $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := JNIDebugWaitAttach
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+
+LOCAL_JNI_SHARED_LIBRARIES := libjnidebugwaitattach
+
+LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
+
+include $(BUILD_PACKAGE)
+include $(LOCAL_PATH)/jnidebugwaitattach/Android.mk

diff --git a/tests/lldb/jni/DebugWaitAttach/AndroidManifest.xml b/tests/lldb/jni/DebugWaitAttach/AndroidManifest.xml
new file mode 100644
index 0000000..12e544b
--- /dev/null
+++ b/tests/lldb/jni/DebugWaitAttach/AndroidManifest.xml

@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    package="com.android.rs.jnidebugwaitattach">
+    <uses-sdk android:minSdkVersion="21" />
+    <application android:label="JNIDebugWaitAttach"
+                 android:hardwareAccelerated="true">
+
+        <activity android:name="MainActivity">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+    </application>
+</manifest>

diff --git a/tests/lldb/jni/DebugWaitAttach/jnidebugwaitattach/Android.mk b/tests/lldb/jni/DebugWaitAttach/jnidebugwaitattach/Android.mk
new file mode 100644
index 0000000..af7d578
--- /dev/null
+++ b/tests/lldb/jni/DebugWaitAttach/jnidebugwaitattach/Android.mk

@@ -0,0 +1,13 @@
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libjnidebugwaitattach
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+
+LOCAL_SRC_FILES := jnidebugwaitattach.cpp simple.rscript
+
+LOCAL_RENDERSCRIPT_FLAGS := -g
+
+include frameworks/rs/tests/lldb/jni/common.mk
+include $(BUILD_SHARED_LIBRARY)

diff --git a/tests/lldb/jni/DebugWaitAttach/jnidebugwaitattach/jnidebugwaitattach.cpp b/tests/lldb/jni/DebugWaitAttach/jnidebugwaitattach/jnidebugwaitattach.cpp
new file mode 100644
index 0000000..f8151f4
--- /dev/null
+++ b/tests/lldb/jni/DebugWaitAttach/jnidebugwaitattach/jnidebugwaitattach.cpp

@@ -0,0 +1,57 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <memory>
+
+#include <jni.h>
+#include <RenderScript.h>
+
+#include "ScriptC_simple.h"
+
+extern "C" void JNICALL
+Java_com_android_rs_jnidebugwaitattach_MainActivity_nativeRS(  // JNI entry point for MainActivity.nativeRS(String)
+    JNIEnv * env,
+    jobject,  // receiver object: the Java side declares nativeRS as an instance method, so this is not a jclass
+    jstring pathObj)  // path passed to RS::init; the Java caller supplies its cache directory
+{
+    static const int size = 8;  // edge length of the square RGBA_8888 allocations
+    sp<RS> rs = new RS();
+
+    const char * path = env->GetStringUTFChars(pathObj, nullptr);  // NOTE(review): result is not checked for nullptr
+    rs->init(path, RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);  // presumably blocks until a debugger attaches - TODO confirm
+    env->ReleaseStringUTFChars(pathObj, path);
+
+    auto e = Element::RGBA_8888(rs);
+    Type::Builder tb(rs, e);
+    tb.setX(size);
+    tb.setY(size);
+    auto t = tb.create();  // size x size type of RGBA_8888 elements
+
+    auto a = Allocation::createTyped(rs, t);  // kernel input; never written by the host in this function
+    auto b = Allocation::createTyped(rs, t);  // kernel output
+
+    // Script is executed once, then the data is copied back when finished
+    sp<ScriptC_simple> s = new ScriptC_simple(rs);
+    s->forEach_simple_kernel(a, b);
+    uint32_t * output = new uint32_t[size*size];
+    b->copy2DRangeTo(0, 0, size, size, output);  // result is read back but not inspected here
+    delete [] output;
+
+    s->forEach_other_kernel(a, b);  // second launch so that both kernels in simple.rscript are executed
+
+    rs->finish();
+}
+

diff --git a/tests/lldb/jni/DebugWaitAttach/jnidebugwaitattach/simple.rscript b/tests/lldb/jni/DebugWaitAttach/jnidebugwaitattach/simple.rscript
new file mode 100644
index 0000000..a89c1f2
--- /dev/null
+++ b/tests/lldb/jni/DebugWaitAttach/jnidebugwaitattach/simple.rscript

@@ -0,0 +1,39 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.jnidebugwaitattach)
+
+float4 gColor = {0.299f, 0.587f, 0.114f, 1.f};  // colour that simple_kernel packs into every output element
+
+/* RenderScript kernel that just sets the colour of the screen and does some
+ * simple operations so it is not completely empty
+ * (and can therefore be debugged).
+ */
+uchar4 __attribute__((kernel)) simple_kernel(uchar4 in)  // Returns gColor packed as 8888, whatever the input is.
+{
+    float4 temp = rsUnpackColor8888(in);  // unpacked input is overwritten on the next line
+    temp = gColor;
+    uchar4 result = rsPackColorTo8888(temp);
+    return result;
+}
+
+// Extra kernel to test lldb setting breakpoints on all the RS kernels.
+uchar4 __attribute__((kernel)) other_kernel(uchar4 in)  // Returns the input with its four components in reverse order.
+{
+    uchar4 result = in.wzyx;
+    return result;
+}

diff --git a/tests/lldb/jni/DebugWaitAttach/res/layout/main_layout.xml b/tests/lldb/jni/DebugWaitAttach/res/layout/main_layout.xml
new file mode 100644
index 0000000..131c3b5
--- /dev/null
+++ b/tests/lldb/jni/DebugWaitAttach/res/layout/main_layout.xml

@@ -0,0 +1,15 @@
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#0099cc"
+    tools:context=".MainActivity">
+
+    <ImageView
+        android:id="@+id/imageView"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:scaleType="fitCenter" />
+
+</FrameLayout>
+

diff --git a/tests/lldb/jni/DebugWaitAttach/src/com/android/rs/jniwaitattachdebug/MainActivity.java b/tests/lldb/jni/DebugWaitAttach/src/com/android/rs/jniwaitattachdebug/MainActivity.java
new file mode 100644
index 0000000..b858cf7
--- /dev/null
+++ b/tests/lldb/jni/DebugWaitAttach/src/com/android/rs/jniwaitattachdebug/MainActivity.java

@@ -0,0 +1,43 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.android.rs.jnidebugwaitattach;
+
+import android.app.Activity;
+import android.os.Bundle;
+import android.graphics.BitmapFactory;
+import android.graphics.Bitmap;
+import android.widget.ImageView;
+
+public class MainActivity extends Activity {  // NOTE(review): file path says jniwaitattachdebug but the package is jnidebugwaitattach - confirm
+    private Bitmap mBitmapIn;  // not referenced anywhere in this class
+    private Bitmap mBitmapOut;  // not referenced anywhere in this class
+
+    static {
+        System.loadLibrary("RS");  // loaded before the JNI library below
+        System.loadLibrary("jnidebugwaitattach");  // library expected to provide nativeRS
+    }
+
+    native void nativeRS(String cacheDir);  // instance native method; receives the cache directory path
+
+    @Override
+    protected void onCreate(Bundle savedInstanceState) {
+        super.onCreate(savedInstanceState);
+        setContentView(R.layout.main_layout);
+        nativeRS(this.getCacheDir().toString());  // synchronous call: onCreate does not return until nativeRS does
+    }
+}
+

diff --git a/tests/lldb/jni/InfiniteLoop/Android.mk b/tests/lldb/jni/InfiniteLoop/Android.mk
new file mode 100644
index 0000000..892e1e9
--- /dev/null
+++ b/tests/lldb/jni/InfiniteLoop/Android.mk

@@ -0,0 +1,19 @@
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) \
+                   $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := JNIInfiniteLoop
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+
+LOCAL_JNI_SHARED_LIBRARIES := libjniinfiniteloop
+
+LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
+
+include $(BUILD_PACKAGE)
+include $(LOCAL_PATH)/jniinfiniteloop/Android.mk

diff --git a/toolkit/test/AndroidManifest.xml b/tests/lldb/jni/InfiniteLoop/AndroidManifest.xml
similarity index 60%
copy from toolkit/test/AndroidManifest.xml
copy to tests/lldb/jni/InfiniteLoop/AndroidManifest.xml
index f709790..23031f0 100644
--- a/toolkit/test/AndroidManifest.xml
+++ b/tests/lldb/jni/InfiniteLoop/AndroidManifest.xml

@@ -1,17 +1,15 @@
 <?xml version="1.0" encoding="utf-8"?>
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
-        package="com.example.testapp">
-
-    <application
-            android:allowBackup="true"
-            android:label="Toolkit Test"
-            android:supportsRtl="true">
-        <activity android:name=".MainActivity">
+    package="com.android.rs.jniinfiniteloop">
+    <uses-sdk android:minSdkVersion="21" />
+    <application android:label="JNIInfiniteLoop"
+                 android:hardwareAccelerated="true">
+        <activity android:name="MainActivity">
             <intent-filter>
                 <action android:name="android.intent.action.MAIN" />
-
                 <category android:name="android.intent.category.LAUNCHER" />
             </intent-filter>
         </activity>
     </application>
 </manifest>
+

diff --git a/tests/lldb/jni/InfiniteLoop/jniinfiniteloop/Android.mk b/tests/lldb/jni/InfiniteLoop/jniinfiniteloop/Android.mk
new file mode 100644
index 0000000..2171722
--- /dev/null
+++ b/tests/lldb/jni/InfiniteLoop/jniinfiniteloop/Android.mk

@@ -0,0 +1,13 @@
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libjniinfiniteloop
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+
+LOCAL_SRC_FILES := jniinfiniteloop.cpp infiniteloop.rscript
+
+LOCAL_RENDERSCRIPT_FLAGS := -g
+
+include frameworks/rs/tests/lldb/jni/common.mk
+include $(BUILD_SHARED_LIBRARY)

diff --git a/tests/lldb/jni/InfiniteLoop/jniinfiniteloop/infiniteloop.rscript b/tests/lldb/jni/InfiniteLoop/jniinfiniteloop/infiniteloop.rscript
new file mode 100644
index 0000000..142b27b
--- /dev/null
+++ b/tests/lldb/jni/InfiniteLoop/jniinfiniteloop/infiniteloop.rscript

@@ -0,0 +1,49 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.jniinfiniteloop)
+#pragma rs_fp_relaxed
+
+float4 gColour = {0.299f, 0.587f, 0.114f, 1.f};  // colour that simple_kernel packs into every output element
+
+/* RenderScript kernel that just sets the colour of the screen and does some
+ * simple operations so it is not completely empty
+ * (and can therefore be debugged).
+ */
+uchar4 __attribute__((kernel)) simple_kernel(uchar4 in)  // Returns gColour packed as 8888, whatever the input is.
+{
+    float4 temp = rsUnpackColor8888(in);  // unpacked input is overwritten on the next line
+    temp = gColour;
+    uchar4 result = rsPackColorTo8888(temp);
+    return result;
+}

diff --git a/tests/lldb/jni/InfiniteLoop/jniinfiniteloop/jniinfiniteloop.cpp b/tests/lldb/jni/InfiniteLoop/jniinfiniteloop/jniinfiniteloop.cpp
new file mode 100644
index 0000000..73d1cbb
--- /dev/null
+++ b/tests/lldb/jni/InfiniteLoop/jniinfiniteloop/jniinfiniteloop.cpp

@@ -0,0 +1,62 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <memory>
+#include <unistd.h>
+
+#include <jni.h>
+#include <RenderScript.h>
+
+#include "ScriptC_infiniteloop.h"
+
+extern "C" void JNICALL
+Java_com_android_rs_jniinfiniteloop_MainActivity_nativeRS(  // JNI entry point for MainActivity.nativeRS(String); never returns
+    JNIEnv * env,
+    jobject,  // receiver object: the Java side declares nativeRS as an instance method, so this is not a jclass
+    jstring pathObj)  // path passed to RS::init; the Java caller supplies its cache directory
+{
+    static const int size = 64;  // edge length of the square RGBA_8888 allocations
+    sp<RS> rs = new RS();
+
+    const char * path = env->GetStringUTFChars(pathObj, nullptr);  // NOTE(review): result is not checked for nullptr
+    rs->init(path, RS_INIT_LOW_LATENCY);  // only the low-latency flag is passed here
+    env->ReleaseStringUTFChars(pathObj, path);
+
+    auto e = Element::RGBA_8888(rs);
+    Type::Builder tb(rs, e);
+    tb.setX(size);
+    tb.setY(size);
+    auto t = tb.create();  // size x size type of RGBA_8888 elements
+
+    auto a = Allocation::createTyped(rs, t);  // kernel input; never written by the host in this function
+    auto b = Allocation::createTyped(rs, t);  // kernel output
+
+    sp<ScriptC_infiniteloop> s = new ScriptC_infiniteloop(rs);
+
+    // Test is designed to loop forever, waits for two seconds
+    // between each invocation of the kernel
+    bool forever = true;  // never modified, so the loop below does not exit
+    while(forever)
+    {
+        s->forEach_simple_kernel(a, b);
+        sleep(2);
+    }
+
+    uint32_t * output = new uint32_t[size*size];  // not reached while the loop above never exits
+    b->copy2DRangeTo(0, 0, size, size, output);
+    delete [] output;
+}
+

diff --git a/tests/lldb/jni/InfiniteLoop/res/layout/main_layout.xml b/tests/lldb/jni/InfiniteLoop/res/layout/main_layout.xml
new file mode 100644
index 0000000..131c3b5
--- /dev/null
+++ b/tests/lldb/jni/InfiniteLoop/res/layout/main_layout.xml

@@ -0,0 +1,15 @@
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#0099cc"
+    tools:context=".MainActivity">
+
+    <ImageView
+        android:id="@+id/imageView"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:scaleType="fitCenter" />
+
+</FrameLayout>
+

diff --git a/tests/lldb/jni/InfiniteLoop/src/com/android/rs/jniinfiniteloop/MainActivity.java b/tests/lldb/jni/InfiniteLoop/src/com/android/rs/jniinfiniteloop/MainActivity.java
new file mode 100644
index 0000000..a18c420
--- /dev/null
+++ b/tests/lldb/jni/InfiniteLoop/src/com/android/rs/jniinfiniteloop/MainActivity.java

@@ -0,0 +1,43 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.android.rs.jniinfiniteloop;
+
+import android.app.Activity;
+import android.os.Bundle;
+import android.graphics.BitmapFactory;
+import android.graphics.Bitmap;
+import android.widget.ImageView;
+
+/**
+ * Activity that loads the native libraries and hands control to the native
+ * RenderScript code as soon as its layout has been set.
+ */
+public class MainActivity extends Activity {
+    static {
+        // The RS library is loaded before the JNI library.
+        System.loadLibrary("RS");
+        System.loadLibrary("jniinfiniteloop");
+    }
+
+    /**
+     * Native method resolved from the libraries loaded above.
+     *
+     * @param cacheDir path of this application's cache directory
+     */
+    native void nativeRS(String cacheDir);
+
+    @Override
+    protected void onCreate(Bundle savedInstanceState) {
+        super.onCreate(savedInstanceState);
+        setContentView(R.layout.main_layout);
+        // NOTE(review): the native implementation added in this change loops
+        // forever, so this call is not expected to return.
+        nativeRS(this.getCacheDir().toString());
+    }
+}
+

diff --git a/tests/lldb/jni/KernelVariables/Android.mk b/tests/lldb/jni/KernelVariables/Android.mk
new file mode 100644
index 0000000..12017d5
--- /dev/null
+++ b/tests/lldb/jni/KernelVariables/Android.mk

@@ -0,0 +1,19 @@
+# Builds the JNIKernelVariables test package from the Java and RenderScript
+# sources under src/, bundling the libjnikernelvariables JNI library.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) \
+                   $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := JNIKernelVariables
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+
+# Native library packaged with the APK; its build rules are included below.
+LOCAL_JNI_SHARED_LIBRARIES := libjnikernelvariables
+
+# RenderScript is compiled with debug info (-g) and without optimisation (-O0).
+LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
+
+include $(BUILD_PACKAGE)
+# Build rules for the JNI library named in LOCAL_JNI_SHARED_LIBRARIES.
+include $(LOCAL_PATH)/jnikernelvariables/Android.mk

diff --git a/tests/lldb/jni/KernelVariables/AndroidManifest.xml b/tests/lldb/jni/KernelVariables/AndroidManifest.xml
new file mode 100644
index 0000000..b719892
--- /dev/null
+++ b/tests/lldb/jni/KernelVariables/AndroidManifest.xml

@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    package="com.android.rs.jnikernelvariables">
+    <uses-sdk android:minSdkVersion="21" />
+    <application android:label="JNIKernelVariables"
+                 android:hardwareAccelerated="true">
+        <activity android:name="MainActivity">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+    </application>
+</manifest>
+

diff --git a/tests/lldb/jni/KernelVariables/jnikernelvariables/Android.mk b/tests/lldb/jni/KernelVariables/jnikernelvariables/Android.mk
new file mode 100644
index 0000000..10afc4b
--- /dev/null
+++ b/tests/lldb/jni/KernelVariables/jnikernelvariables/Android.mk

@@ -0,0 +1,13 @@
+# Builds the libjnikernelvariables shared library from the JNI source and
+# the RenderScript script that sits next to it.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libjnikernelvariables
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+
+LOCAL_SRC_FILES := jnikernelvariables.cpp simple.rscript
+
+# Emit debug info for the RenderScript source.
+LOCAL_RENDERSCRIPT_FLAGS := -g
+
+# Presumably the settings shared by the lldb JNI tests; the file's contents
+# are not visible here.
+include frameworks/rs/tests/lldb/jni/common.mk
+include $(BUILD_SHARED_LIBRARY)

diff --git a/tests/lldb/jni/KernelVariables/jnikernelvariables/jnikernelvariables.cpp b/tests/lldb/jni/KernelVariables/jnikernelvariables/jnikernelvariables.cpp
new file mode 100644
index 0000000..94917bf
--- /dev/null
+++ b/tests/lldb/jni/KernelVariables/jnikernelvariables/jnikernelvariables.cpp

@@ -0,0 +1,94 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <memory>
+
+#include <jni.h>
+#include <RenderScript.h>
+
+#include "ScriptC_simple.h"
+
+/*
+ * JNI entry point backing MainActivity.nativeRS(String).
+ *
+ * Initialises a RenderScript context with the directory passed in pathObj
+ * (flags RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH), binds allocations
+ * and a sampler to the globals of the 'simple' script, runs its kernel once
+ * over two 64x64 RGBA_8888 allocations and copies the output allocation
+ * back into host memory. Returns early if the context cannot be set up.
+ */
+extern "C" JNIEXPORT void JNICALL
+Java_com_android_rs_jnikernelvariables_MainActivity_nativeRS(
+    JNIEnv * env,
+    jobject,  // nativeRS is declared as an instance method on the Java side
+    jstring pathObj)
+{
+    static const int size = 64;
+    sp<RS> rs = new RS();
+
+    // GetStringUTFChars yields nullptr when the characters cannot be
+    // obtained; never hand a null path to RS::init.
+    const char * path = env->GetStringUTFChars(pathObj, nullptr);
+    if (path == nullptr)
+    {
+        return;
+    }
+    const bool initialised = rs->init(path, RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);
+    env->ReleaseStringUTFChars(pathObj, path);
+    if (!initialised)
+    {
+        // Without a context none of the objects below can be created.
+        return;
+    }
+
+    // Input and output allocations for the kernel launch.
+    auto e = Element::RGBA_8888(rs);
+    Type::Builder tb(rs, e);
+    tb.setX(size);
+    tb.setY(size);
+    auto t = tb.create();
+
+    auto a = Allocation::createTyped(rs, t);
+    auto b = Allocation::createTyped(rs, t);
+
+    sp<ScriptC_simple> s = new ScriptC_simple(rs);
+
+    // 1D allocation of four ints.
+    static const int buffer_int[] = {1, 2, 3, 4};
+    sp<Allocation> int_allocation = Allocation::createSized(rs, Element::I32(rs), 4);
+    int_allocation->copy1DRangeFrom(0, 4, buffer_int);
+    s->set_allocation_1D_global(int_allocation);
+
+    // 2x2 allocation of ints; note it is bound to the global named
+    // allocation_1D_global2.
+    static const int buffer_int2[] = {5, 6, 7, 8};
+
+    Type::Builder typeI32Builder2D(rs, Element::I32(rs));
+    typeI32Builder2D.setX(2);
+    typeI32Builder2D.setY(2);
+
+    sp<Allocation> int_allocation2 = Allocation::createTyped(rs, typeI32Builder2D.create());
+    int_allocation2->copy2DRangeFrom(0, 0, 2, 2, buffer_int2);
+    s->set_allocation_1D_global2(int_allocation2);
+
+    s->set_allocation_2D_global(a);
+    s->set_allocation_2D_global2(b);
+
+    // 2x2x2 allocation of ints.
+    static const int buffer_int3[] = {9, 10, 11, 12, 13, 14, 15, 16};
+
+    Type::Builder typeI32Builder3D(rs, Element::I32(rs));
+    typeI32Builder3D.setX(2);
+    typeI32Builder3D.setY(2);
+    typeI32Builder3D.setZ(2);
+
+    sp<Allocation> int_allocation3 = Allocation::createTyped(rs, typeI32Builder3D.create());
+    int_allocation3->copy3DRangeFrom(0, 0, 0, 2, 2, 2, buffer_int3);
+    s->set_allocation_3D_global(int_allocation3);
+
+    // 4x4 YV12 allocation; no data is copied into it.
+    Type::Builder yuvTypeBuilder(rs, Element::YUV(rs));
+    yuvTypeBuilder.setX(4);
+    yuvTypeBuilder.setY(4);
+    yuvTypeBuilder.setYuvFormat(RS_YUV_YV12);
+
+    sp<Allocation> yuv_allocation = Allocation::createTyped(rs, yuvTypeBuilder.create());
+    s->set_allocation_YUV_2D_global(yuv_allocation);
+
+    s->set_sampler_global(Sampler::CLAMP_LINEAR(rs));
+
+    // Script is executed once, then the data is copied back when finished
+    s->forEach_kernel(a, b);
+    rs->finish();
+    // Owned buffer: released automatically when it goes out of scope.
+    std::unique_ptr<uint32_t[]> output(new uint32_t[size * size]);
+    b->copy2DRangeTo(0, 0, size, size, output.get());
+}
+

diff --git a/tests/lldb/jni/KernelVariables/jnikernelvariables/simple.rscript b/tests/lldb/jni/KernelVariables/jnikernelvariables/simple.rscript
new file mode 100644
index 0000000..30feb00
--- /dev/null
+++ b/tests/lldb/jni/KernelVariables/jnikernelvariables/simple.rscript

@@ -0,0 +1,197 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.jnikernelvariables)
+
+char char_global = 12;  // scalar globals; char/short/int/float/long/double are overwritten by kernel()
+uchar uchar_global = 234;
+short short_global = -321;
+ushort ushort_global = 432;
+int int_global = 1234;
+uint uint_global = 2345;
+float float_global = 4.5f;
+long long_global = -77777;
+ulong ulong_global = 8888;
+double double_global = -456.5f;
+
+char2 char2_global = {11, -22};  // two-component vector globals
+uchar2 uchar2_global = {33, 44};
+short2 short2_global = {-555, 666};
+ushort2 ushort2_global = {777, 888};
+int2 int2_global = {999, -1111};
+uint2 uint2_global = {2222, 3333};
+float2 float2_global = {4.5f, -5.0f};
+long2 long2_global = {-4444, 5555};
+ulong2 ulong2_global = {6666, 7777};
+double2 double2_global = {88.5f, -99.0f};
+
+char3 char3_global = {11, -22, -33};  // three-component vector globals
+uchar3 uchar3_global = {33, 44, 55};
+short3 short3_global = {-555, 666, 777};
+ushort3 ushort3_global = {777, 888, 999};
+int3 int3_global = {999, -1111, 2222};
+uint3 uint3_global = {2222, 3333, 4444};
+float3 float3_global = {4.5f, -5.0f, -6.5f};
+long3 long3_global = {-4444, 5555, 6666};
+ulong3 ulong3_global = {6666, 7777, 8888};
+double3 double3_global = {88.5f, -99.0f, 111.5f};
+
+char4 char4_global = {55, 11, -22, -33};  // four-component vector globals
+uchar4 uchar4_global = {222, 33, 44, 55};
+short4 short4_global = {-444, -555, 666, 777};
+ushort4 ushort4_global = {666, 777, 888, 999};
+int4 int4_global = {888, 999, -1111, 2222};
+uint4 uint4_global = {1111, 2222, 3333, 4444};
+float4 float4_global = {3.0f, 4.5f, -5.0f, -6.5f};
+long4 long4_global = {-3333, -4444, 5555, 6666};
+ulong4 ulong4_global = {5555, 6666, 7777, 8888};
+double4 double4_global = {-77.0f, 88.5f, -99.0f, 111.5f};
+
+rs_matrix2x2 matrix2x2_global;  // the matrix globals are assigned from the matching locals in kernel()
+rs_matrix3x3 matrix3x3_global;
+rs_matrix4x4 matrix4x4_global;
+
+rs_quaternion quaternion_global;  // set with rsQuaternionSet() in kernel()
+
+rs_allocation allocation_1D_global;  // the allocation globals are bound by the JNI host code through set_*()
+rs_allocation allocation_1D_global2;  // NOTE(review): the host binds a 2x2 allocation here despite the 1D name - confirm
+rs_allocation allocation_2D_global;
+rs_allocation allocation_2D_global2;
+rs_allocation allocation_3D_global;
+rs_allocation allocation_YUV_2D_global;
+
+rs_allocation_cubemap_face cubemap_face_global;  // never assigned in this file
+rs_sampler sampler_global;  // bound by the JNI host code through set_sampler_global()
+
+uchar4 __attribute__((kernel)) kernel(uchar4 in)  // 'in' is never read; every element receives {1,2,3,4}
+{
+    char char_local = 'a';  // one local per scalar type
+    uchar uchar_local = 'b';
+    short short_local = -321;
+    ushort ushort_local = 432;
+    int int_local = 1234;
+    uint uint_local = 2345;
+    float float_local = 4.5f;
+    long long_local = -77777;
+    ulong ulong_local = 8888;
+    double double_local = -456.5f;
+
+    char2 char2_local = {-11, -22};  // two-component vector locals
+    uchar2 uchar2_local = {33, 44};
+    short2 short2_local = {-555, 666};
+    ushort2 ushort2_local = {777, 888};
+    int2 int2_local = {999, -1111};
+    uint2 uint2_local = {2222, 3333};
+    float2 float2_local = {4.5f, -5.0f};
+    long2 long2_local = {-4444, 5555};
+    ulong2 ulong2_local = {6666, 7777};
+    double2 double2_local = {88.5f, -99.0f};
+
+    char3 char3_local = {11, -22, -33};  // three-component vector locals
+    uchar3 uchar3_local = {33, 44, 55};
+    short3 short3_local = {-555, 666, 777};
+    ushort3 ushort3_local = {777, 888, 999};
+    int3 int3_local = {999, -1111, 2222};
+    uint3 uint3_local = {2222, 3333, 4444};
+    float3 float3_local = {4.5f, -5.0f, -6.5f};
+    long3 long3_local = {-4444, 5555, 6666};
+    ulong3 ulong3_local = {6666, 7777, 8888};
+    double3 double3_local = {88.5f, -99.0f, 111.5f};
+
+    char4 char4_local = {55, 11, -22, -33};  // four-component vector locals
+    uchar4 uchar4_local = {22, 33, 44, 55};
+    short4 short4_local = {-444, -555, 666, 777};
+    ushort4 ushort4_local = {666, 777, 888, 999};
+    int4 int4_local = {888, 999, -1111, 2222};
+    uint4 uint4_local = {1111, 2222, 3333, 4444};
+    float4 float4_local = {3.0f, 4.5f, -5.0f, -6.5f};
+    long4 long4_local = {-3333, -4444, 5555, 6666};
+    ulong4 ulong4_local = {5555, 6666, 7777, 8888};
+    double4 double4_local = {-77.0f, 88.5f, -99.0f, 111.5f};
+
+    rs_matrix2x2 matrix2x2_local = {{1., 2.5,
+                                     3., 4.5}};
+    rs_matrix3x3 matrix3x3_local = {{5., 6.5, 7.,
+                                     8.5, 9., 1.5,
+                                     2., 3.5, 4.}};
+    rs_matrix4x4 matrix4x4_local = {{5.5, 6., 7.5, 8.,
+                                     9., 1.5, 2., 3.5,
+                                     4.5, 5.5, 6.5, 7.,
+                                     8., 9.5, 1.5, 2.5}};
+
+    matrix2x2_global = matrix2x2_local;  // copy the local matrices into the script globals
+    matrix3x3_global = matrix3x3_local;
+    matrix4x4_global = matrix4x4_local;
+
+    rsQuaternionSet(&quaternion_global, 3.0, 4.5, 5.5, 6.0);
+
+    rs_quaternion quaternion_local;
+    rsQuaternionSet(&quaternion_local, 7.5, 8.0, 9.0, 0.5);
+
+    char char_combined = char_local + (char)uchar_local + char2_local.x +  // sums the scalar and each vector's .x
+        (char)uchar2_local.x + char3_local.x - (char)uchar3_local.x +  // NOTE(review): the only subtracted term in the *_combined sums - confirm intended
+        char4_local.x + (char)uchar4_local.x;
+
+    short short_combined = short_local + (short)ushort_local + short2_local.x +
+        (short)ushort2_local.x + short3_local.x + (short)ushort3_local.x +
+        short4_local.x + (short)ushort4_local.x;
+
+    int int_combined = int_local + (int)uint_local + int2_local.x +
+        (int)uint2_local.x + int3_local.x + (int)uint3_local.x + int4_local.x +
+        (int)uint4_local.x;
+
+    float float_combined = float_local + float2_local.x + float3_local.x +
+        float4_local.x;
+
+    long long_combined = long_local + (long)ulong_local + long2_local.x +
+        (long)ulong2_local.x + long3_local.x + (long)ulong3_local.x +
+        long4_local.x + (long)ulong4_local.x;
+
+    double double_combined = double_local + double2_local.x + double3_local.x +
+        double4_local.x;
+
+    char_global = char_combined;  // publish the combined values through the script globals
+    short_global = short_combined;
+    int_global = int_combined;
+    float_global = float_combined;
+    long_global = long_combined;
+    double_global = double_combined;
+
+    uchar4 result = {1,2,3,4};  // constant output, independent of 'in'
+    return result;
+}
+
+float use_constants_global;  // written only by setup() in this file
+
+void setup(void)  // NOTE(review): no call to this is visible in the JNI host code - confirm how it is invoked
+{
+  use_constants_global =  // sum of the math constants listed below
+      M_1_PI +
+      M_2_PI +
+      M_2_PIl +
+      M_2_SQRTPI +
+      M_E +
+      M_LN10 +
+      M_LN2 +
+      M_LOG10E +
+      M_LOG2E +
+      M_PI +
+      M_PI_2 +
+      M_PI_4 +
+      M_SQRT1_2 +
+      M_SQRT2;
+}

diff --git a/tests/lldb/jni/KernelVariables/res/layout/main_layout.xml b/tests/lldb/jni/KernelVariables/res/layout/main_layout.xml
new file mode 100644
index 0000000..131c3b5
--- /dev/null
+++ b/tests/lldb/jni/KernelVariables/res/layout/main_layout.xml

@@ -0,0 +1,15 @@
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#0099cc"
+    tools:context=".MainActivity">
+
+    <ImageView
+        android:id="@+id/imageView"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:scaleType="fitCenter" />
+
+</FrameLayout>
+

diff --git a/tests/lldb/jni/KernelVariables/src/com/android/rs/jnikernelvariables/MainActivity.java b/tests/lldb/jni/KernelVariables/src/com/android/rs/jnikernelvariables/MainActivity.java
new file mode 100644
index 0000000..11e41f2
--- /dev/null
+++ b/tests/lldb/jni/KernelVariables/src/com/android/rs/jnikernelvariables/MainActivity.java

@@ -0,0 +1,42 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.android.rs.jnikernelvariables;
+
+import android.app.Activity;
+import android.os.Bundle;
+import android.graphics.BitmapFactory;
+import android.graphics.Bitmap;
+import android.widget.ImageView;
+
+/**
+ * Activity that loads the native libraries and runs the native RenderScript
+ * code as soon as its layout has been set.
+ */
+public class MainActivity extends Activity {
+    static {
+        // The RS library is loaded before the JNI library.
+        System.loadLibrary("RS");
+        System.loadLibrary("jnikernelvariables");
+    }
+
+    /**
+     * Native method resolved from the libraries loaded above.
+     *
+     * @param cacheDir path of this application's cache directory
+     */
+    native void nativeRS(String cacheDir);
+
+    @Override
+    protected void onCreate(Bundle savedInstanceState) {
+        super.onCreate(savedInstanceState);
+        setContentView(R.layout.main_layout);
+        // The native code is run synchronously from onCreate.
+        nativeRS(this.getCacheDir().toString());
+    }
+}

diff --git a/tests/lldb/jni/MultipleRSFiles/Android.mk b/tests/lldb/jni/MultipleRSFiles/Android.mk
new file mode 100644
index 0000000..ff24154
--- /dev/null
+++ b/tests/lldb/jni/MultipleRSFiles/Android.mk

@@ -0,0 +1,19 @@
+# Builds the JNIMultipleRSFiles test package from the Java and RenderScript
+# sources under src/, bundling the libjnimultiplersfiles JNI library.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) \
+                   $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := JNIMultipleRSFiles
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+
+# Native library packaged with the APK; its build rules are included below.
+LOCAL_JNI_SHARED_LIBRARIES := libjnimultiplersfiles
+
+# RenderScript is compiled with debug info (-g) and without optimisation (-O0).
+LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
+
+include $(BUILD_PACKAGE)
+# Build rules for the JNI library named in LOCAL_JNI_SHARED_LIBRARIES.
+include $(LOCAL_PATH)/jnimultiplersfiles/Android.mk

diff --git a/tests/lldb/jni/MultipleRSFiles/AndroidManifest.xml b/tests/lldb/jni/MultipleRSFiles/AndroidManifest.xml
new file mode 100644
index 0000000..19bb65c
--- /dev/null
+++ b/tests/lldb/jni/MultipleRSFiles/AndroidManifest.xml

@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    package="com.android.rs.jnimultiplersfiles">
+    <uses-sdk android:minSdkVersion="21" />
+    <application android:label="JNIMultipleRSFiles"
+                 android:hardwareAccelerated="true">
+
+        <activity android:name="MainActivity">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+    </application>
+</manifest>

diff --git a/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/Android.mk b/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/Android.mk
new file mode 100644
index 0000000..b3c335f
--- /dev/null
+++ b/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/Android.mk

@@ -0,0 +1,13 @@
+# Builds the libjnimultiplersfiles shared library from the JNI source and
+# the two RenderScript scripts that sit next to it.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libjnimultiplersfiles
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+
+LOCAL_SRC_FILES := jnimultiplersfiles.cpp first.rscript second.rscript
+
+# Emit debug info for the RenderScript sources.
+LOCAL_RENDERSCRIPT_FLAGS := -g
+
+# Presumably the settings shared by the lldb JNI tests; the file's contents
+# are not visible here.
+include frameworks/rs/tests/lldb/jni/common.mk
+include $(BUILD_SHARED_LIBRARY)

diff --git a/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/first.rscript b/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/first.rscript
new file mode 100644
index 0000000..7c4a852
--- /dev/null
+++ b/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/first.rscript

@@ -0,0 +1,32 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.jnimultiplersfiles)
+
+float4 gColor = {0.299f, 0.587f, 0.114f, 1.f};  // colour returned by first_kernel for every element
+
+/* RenderScript kernel that returns gColor, packed to 8888, for every element.
+ * The input is unpacked into temp only so that the kernel is not completely
+ * empty (and can therefore be debugged); that value is then overwritten.
+ */
+uchar4 __attribute__((kernel)) first_kernel(uchar4 in)
+{
+    float4 temp = rsUnpackColor8888(in);
+    temp = gColor;  // discards the unpacked input
+    uchar4 result = rsPackColorTo8888(temp);
+    return result;
+}

diff --git a/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/jnimultiplersfiles.cpp b/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/jnimultiplersfiles.cpp
new file mode 100644
index 0000000..6d28a4b
--- /dev/null
+++ b/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/jnimultiplersfiles.cpp

@@ -0,0 +1,60 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <memory>
+
+#include <jni.h>
+#include <RenderScript.h>
+
+#include "ScriptC_first.h"
+#include "ScriptC_second.h"
+
+/*
+ * JNI entry point backing MainActivity.nativeRS(String).
+ *
+ * Initialises a RenderScript context with the directory passed in pathObj
+ * (flags RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH), then runs one
+ * kernel from each of the two scripts over the same pair of 64x64
+ * RGBA_8888 allocations. The output of the first kernel is copied back to
+ * host memory before the second kernel is launched. Returns early if the
+ * context cannot be set up.
+ */
+extern "C" JNIEXPORT void JNICALL
+Java_com_android_rs_jnimultiplersfiles_MainActivity_nativeRS(
+    JNIEnv * env,
+    jobject,  // nativeRS is declared as an instance method on the Java side
+    jstring pathObj)
+{
+    static const int size = 64;
+    sp<RS> rs = new RS();
+
+    // GetStringUTFChars yields nullptr when the characters cannot be
+    // obtained; never hand a null path to RS::init.
+    const char * path = env->GetStringUTFChars(pathObj, nullptr);
+    if (path == nullptr)
+    {
+        return;
+    }
+    const bool initialised = rs->init(path, RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);
+    env->ReleaseStringUTFChars(pathObj, path);
+    if (!initialised)
+    {
+        // Without a context none of the objects below can be created.
+        return;
+    }
+
+    auto e = Element::RGBA_8888(rs);
+    Type::Builder tb(rs, e);
+    tb.setX(size);
+    tb.setY(size);
+    auto t = tb.create();
+
+    auto a = Allocation::createTyped(rs, t);
+    auto b = Allocation::createTyped(rs, t);
+
+    // Script is executed once, then the data is copied back when finished
+    sp<ScriptC_first> s1 = new ScriptC_first(rs);
+    sp<ScriptC_second> s2 = new ScriptC_second(rs);
+
+    s1->forEach_first_kernel(a, b);
+    {
+        // Scoped so the host buffer is released before the second launch,
+        // as in the original delete[] placement.
+        std::unique_ptr<uint32_t[]> output(new uint32_t[size * size]);
+        b->copy2DRangeTo(0, 0, size, size, output.get());
+    }
+
+    s2->forEach_second_kernel(a, b);
+
+    rs->finish();
+}
+

diff --git a/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/second.rscript b/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/second.rscript
new file mode 100644
index 0000000..ab898f9
--- /dev/null
+++ b/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/second.rscript

@@ -0,0 +1,25 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.jnimultiplersfiles)
+
+/* RenderScript kernel that returns the input with its components reversed (wzyx swizzle). */
+uchar4 __attribute__((kernel)) second_kernel(uchar4 in)
+{
+    uchar4 result = in.wzyx;  // result.x = in.w, .y = in.z, .z = in.y, .w = in.x
+    return result;
+}

diff --git a/tests/lldb/jni/MultipleRSFiles/res/layout/main_layout.xml b/tests/lldb/jni/MultipleRSFiles/res/layout/main_layout.xml
new file mode 100644
index 0000000..131c3b5
--- /dev/null
+++ b/tests/lldb/jni/MultipleRSFiles/res/layout/main_layout.xml

@@ -0,0 +1,15 @@
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#0099cc"
+    tools:context=".MainActivity">
+
+    <ImageView
+        android:id="@+id/imageView"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:scaleType="fitCenter" />
+
+</FrameLayout>
+

diff --git a/tests/lldb/jni/MultipleRSFiles/src/com/android/rs/jnimultiplersfiles/MainActivity.java b/tests/lldb/jni/MultipleRSFiles/src/com/android/rs/jnimultiplersfiles/MainActivity.java
new file mode 100644
index 0000000..ea743a6
--- /dev/null
+++ b/tests/lldb/jni/MultipleRSFiles/src/com/android/rs/jnimultiplersfiles/MainActivity.java

@@ -0,0 +1,43 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.android.rs.jnimultiplersfiles;
+
+import android.app.Activity;
+import android.os.Bundle;
+import android.graphics.BitmapFactory;
+import android.graphics.Bitmap;
+import android.widget.ImageView;
+
+/**
+ * Activity that loads the native libraries and runs the native RenderScript
+ * code as soon as its layout has been set.
+ */
+public class MainActivity extends Activity {
+    static {
+        // The RS library is loaded before the JNI library.
+        System.loadLibrary("RS");
+        System.loadLibrary("jnimultiplersfiles");
+    }
+
+    /**
+     * Native method resolved from the libraries loaded above.
+     *
+     * @param cacheDir path of this application's cache directory
+     */
+    native void nativeRS(String cacheDir);
+
+    @Override
+    protected void onCreate(Bundle savedInstanceState) {
+        super.onCreate(savedInstanceState);
+        setContentView(R.layout.main_layout);
+        // The native code is run synchronously from onCreate.
+        nativeRS(this.getCacheDir().toString());
+    }
+}
+

diff --git a/tests/lldb/jni/NoDebugWaitAttach/Android.mk b/tests/lldb/jni/NoDebugWaitAttach/Android.mk
new file mode 100644
index 0000000..887b199
--- /dev/null
+++ b/tests/lldb/jni/NoDebugWaitAttach/Android.mk

@@ -0,0 +1,19 @@
+# Builds the JNINoDebugWaitAttach test package from the Java and RenderScript
+# sources under src/, bundling the libjninodebugwaitattach JNI library.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE_TAGS := tests
+
+LOCAL_SRC_FILES := $(call all-java-files-under, src) \
+                   $(call all-renderscript-files-under, src)
+
+LOCAL_PACKAGE_NAME := JNINoDebugWaitAttach
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_SDK_VERSION := current
+
+# Native library packaged with the APK; its build rules are included below.
+LOCAL_JNI_SHARED_LIBRARIES := libjninodebugwaitattach
+
+# No -g is passed here: RenderScript is built unoptimised (-O0) but without
+# debug info.
+LOCAL_RENDERSCRIPT_FLAGS := -O0 -target-api 0
+
+include $(BUILD_PACKAGE)
+# Build rules for the JNI library named in LOCAL_JNI_SHARED_LIBRARIES.
+include $(LOCAL_PATH)/jninodebugwaitattach/Android.mk

diff --git a/tests/lldb/jni/NoDebugWaitAttach/AndroidManifest.xml b/tests/lldb/jni/NoDebugWaitAttach/AndroidManifest.xml
new file mode 100644
index 0000000..846eddd
--- /dev/null
+++ b/tests/lldb/jni/NoDebugWaitAttach/AndroidManifest.xml

@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    package="com.android.rs.jninodebugwaitattach">
+    <uses-sdk android:minSdkVersion="21" />
+    <application android:label="JNINoDebugWaitAttach"
+                 android:hardwareAccelerated="true">
+
+        <activity android:name="MainActivity">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+    </application>
+</manifest>

diff --git a/tests/lldb/jni/NoDebugWaitAttach/jninodebugwaitattach/Android.mk b/tests/lldb/jni/NoDebugWaitAttach/jninodebugwaitattach/Android.mk
new file mode 100644
index 0000000..19b0807
--- /dev/null
+++ b/tests/lldb/jni/NoDebugWaitAttach/jninodebugwaitattach/Android.mk

@@ -0,0 +1,11 @@
+# Builds the libjninodebugwaitattach shared library from the JNI source and
+# the RenderScript script that sits next to it. No LOCAL_RENDERSCRIPT_FLAGS
+# are set in this makefile.
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libjninodebugwaitattach
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
+LOCAL_LICENSE_CONDITIONS := notice
+
+LOCAL_SRC_FILES := jninodebugwaitattach.cpp simple.rscript
+
+# Presumably the settings shared by the lldb JNI tests; the file's contents
+# are not visible here.
+include frameworks/rs/tests/lldb/jni/common.mk
+include $(BUILD_SHARED_LIBRARY)

diff --git a/tests/lldb/jni/NoDebugWaitAttach/jninodebugwaitattach/jninodebugwaitattach.cpp b/tests/lldb/jni/NoDebugWaitAttach/jninodebugwaitattach/jninodebugwaitattach.cpp
new file mode 100644
index 0000000..72ec36e
--- /dev/null
+++ b/tests/lldb/jni/NoDebugWaitAttach/jninodebugwaitattach/jninodebugwaitattach.cpp

@@ -0,0 +1,55 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <memory>
+
+#include <jni.h>
+#include <RenderScript.h>
+
+#include "ScriptC_simple.h"
+
+/*
+ * JNI entry point for the nativeRS method of
+ * com.android.rs.jninodebugwaitattach.MainActivity.
+ *
+ * Initialises a RenderScript context with the directory passed in pathObj
+ * (flags RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH), runs simple_kernel
+ * once over two 8x8 RGBA_8888 allocations and copies the output allocation
+ * back into host memory. Returns early if the context cannot be set up.
+ */
+extern "C" JNIEXPORT void JNICALL
+Java_com_android_rs_jninodebugwaitattach_MainActivity_nativeRS(
+    JNIEnv * env,
+    jclass,
+    jstring pathObj)
+{
+    static const int size = 8;
+    sp<RS> rs = new RS();
+
+    // GetStringUTFChars yields nullptr when the characters cannot be
+    // obtained; never hand a null path to RS::init.
+    const char * path = env->GetStringUTFChars(pathObj, nullptr);
+    if (path == nullptr)
+    {
+        return;
+    }
+    const bool initialised = rs->init(path, RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);
+    env->ReleaseStringUTFChars(pathObj, path);
+    if (!initialised)
+    {
+        // Without a context none of the objects below can be created.
+        return;
+    }
+
+    auto e = Element::RGBA_8888(rs);
+    Type::Builder tb(rs, e);
+    tb.setX(size);
+    tb.setY(size);
+    auto t = tb.create();
+
+    auto a = Allocation::createTyped(rs, t);
+    auto b = Allocation::createTyped(rs, t);
+
+    // Script is executed once, then the data is copied back when finished
+    sp<ScriptC_simple> s = new ScriptC_simple(rs);
+    s->forEach_simple_kernel(a, b);
+    {
+        // Scoped so the host buffer is released before finish(), as in the
+        // original delete[] placement.
+        std::unique_ptr<uint32_t[]> output(new uint32_t[size * size]);
+        b->copy2DRangeTo(0, 0, size, size, output.get());
+    }
+
+    rs->finish();
+}
+

diff --git a/tests/lldb/jni/NoDebugWaitAttach/jninodebugwaitattach/simple.rscript b/tests/lldb/jni/NoDebugWaitAttach/jninodebugwaitattach/simple.rscript
new file mode 100644
index 0000000..c55e0b5
--- /dev/null
+++ b/tests/lldb/jni/NoDebugWaitAttach/jninodebugwaitattach/simple.rscript

@@ -0,0 +1,32 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma version(1)
+#pragma rs java_package_name(com.android.rs.jninodebugwaitattach)
+
+float4 gColor = {0.299f, 0.587f, 0.114f, 1.f};
+
+/* RenderScript kernel that returns gColor packed as an 8888 pixel for every
+ * element. The unpacked input is overwritten straight away; the unpack call
+ * only keeps the body from being empty (so that it can be debugged).
+ */
+uchar4 __attribute__((kernel)) simple_kernel(uchar4 in)
+{
+    float4 temp = rsUnpackColor8888(in);
+    temp = gColor;
+    uchar4 result = rsPackColorTo8888(temp);
+    return result;
+}
\ No newline at end of file

diff --git a/tests/lldb/jni/NoDebugWaitAttach/res/layout/main_layout.xml b/tests/lldb/jni/NoDebugWaitAttach/res/layout/main_layout.xml
new file mode 100644
index 0000000..131c3b5
--- /dev/null
+++ b/tests/lldb/jni/NoDebugWaitAttach/res/layout/main_layout.xml

@@ -0,0 +1,15 @@
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#0099cc"
+    tools:context=".MainActivity">
+
+    <ImageView
+        android:id="@+id/imageView"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:scaleType="fitCenter" />
+
+</FrameLayout>
+

diff --git a/tests/lldb/jni/NoDebugWaitAttach/src/com/android/rs/jninodebugwaitattach/MainActivity.java b/tests/lldb/jni/NoDebugWaitAttach/src/com/android/rs/jninodebugwaitattach/MainActivity.java
new file mode 100644
index 0000000..937e4d0
--- /dev/null
+++ b/tests/lldb/jni/NoDebugWaitAttach/src/com/android/rs/jninodebugwaitattach/MainActivity.java

@@ -0,0 +1,43 @@
+/*
+* Copyright (C) 2016 The Android Open Source Project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.android.rs.jninodebugwaitattach;
+
+import android.app.Activity;
+import android.os.Bundle;
+import android.graphics.BitmapFactory;
+import android.graphics.Bitmap;
+import android.widget.ImageView;
+
+/**
+ * Activity of the jninodebugwaitattach test app.
+ *
+ * Loads the native libraries when the class is initialised and calls into
+ * the native RenderScript code once, from {@link #onCreate(Bundle)}.
+ */
+public class MainActivity extends Activity {
+    static {
+        System.loadLibrary("RS");
+        System.loadLibrary("jninodebugwaitattach");
+    }
+
+    /**
+     * Native method implemented in the jninodebugwaitattach library.
+     *
+     * @param cacheDir path string handed to the native code; onCreate passes
+     *                 the application cache directory.
+     */
+    native void nativeRS(String cacheDir);
+
+    /**
+     * Shows the main layout and then runs the native RenderScript code.
+     *
+     * @param savedInstanceState forwarded unchanged to the superclass.
+     */
+    @Override
+    protected void onCreate(Bundle savedInstanceState) {
+        super.onCreate(savedInstanceState);
+        setContentView(R.layout.main_layout);
+        nativeRS(this.getCacheDir().toString());
+    }
+}
+

diff --git a/tests/lldb/jni/common.mk b/tests/lldb/jni/common.mk
new file mode 100644
index 0000000..7441307
--- /dev/null
+++ b/tests/lldb/jni/common.mk

@@ -0,0 +1,13 @@
+# Common LOCAL_* settings for the modules under tests/lldb/jni.
+# NOTE(review): presumably included from each app's own makefile - confirm.
+LOCAL_MODULE_TAGS := tests
+
+# Enable C++ exception support.
+LOCAL_CPP_FEATURES += exceptions
+
+# All warnings enabled and treated as errors; sources are built as C++11.
+# Note that ':=' replaces any LOCAL_CFLAGS value set before this point.
+LOCAL_CFLAGS := -Werror -Wall -Wextra -std=c++11
+# RenderScript sources are compiled without optimisation (-O0).
+LOCAL_RENDERSCRIPT_FLAGS += -O0 -target-api 0
+
+# JNI headers plus the libraries the native code links against.
+LOCAL_HEADER_LIBRARIES := jni_headers
+LOCAL_SHARED_LIBRARIES += libdl liblog
+LOCAL_STATIC_LIBRARIES += libRScpp_static
+
+# Build against SDK version 23 with the static c++ NDK STL variant.
+LOCAL_SDK_VERSION := 23
+LOCAL_NDK_STL_VARIANT := c++_static

diff --git a/tests/lldb/run_tests.py b/tests/lldb/run_tests.py
new file mode 100755
index 0000000..d1700bb
--- /dev/null
+++ b/tests/lldb/run_tests.py

@@ -0,0 +1,839 @@
+#!/usr/bin/env python
+
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Main test suite execution script.'''
+import argparse
+import inspect
+import logging
+import os
+import signal
+import subprocess
+import sys
+import time
+import collections
+import xml.etree.ElementTree as ET
+
+from config import Config
+from tests.harness import util_constants
+from tests.harness.exception import TestSuiteException, FailFastException
+from tests.harness import UtilAndroid
+from tests.harness import UtilBundle
+from tests.harness import util_log
+from tests.harness.util_functions import load_py_module
+from tests.harness.decorators import deprecated
+
+# For some reason pylint is not able to understand the class returned by
+# from util_log.get_logger() and generates a lot of false warnings
+#pylint: disable=maybe-no-member
+
+EMU_PROC = None
+
+def _parse_args():
+    '''Parse the command line arguments.
+
+    Besides the fixed options declared below, one '--<name>' option is
+    generated for every data descriptor found on the Config class (except
+    '__weakref__', 'device' and 'verbose'); underscores in the member name
+    are replaced with dashes to build the option name.
+
+    Returns:
+        A namespace object that contains the options specified to run_tests on
+        the command line.
+    '''
+
+    parser = argparse.ArgumentParser(description='Run the test suite.')
+
+    parser.add_argument('--config', '-c',
+                        metavar='path',
+                        help='Path to a custom config file.')
+    parser.add_argument('--device', '-d',
+                        help='Specify the device id of the device to test on.')
+    parser.add_argument('--test', '-t',
+                        metavar='path',
+                        help='Specify a specific test to run.')
+
+    # --wimpy and --app-types cannot be combined.
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument('--wimpy', '-w',
+                       action='store_true',
+                       default=None,
+                       help='Test only a core subset of features.')
+    group.add_argument('--app-types',
+                       default=['java', 'cpp', 'jni'],
+                       nargs='*',
+                       help='Specify a list of Android app types against which'
+                            ' to run the tests',
+                       dest='bundle_types')
+    parser.add_argument('--install-only',
+                        action='store_true',
+                        default=False,
+                        help='It only runs the pre-run stage of the test suite.'
+                             ' It installs the required APKs but does not '
+                             'execute the tests.',
+                        dest='install_only')
+    parser.add_argument('--no-install', '-n',
+                        action='store_true',
+                        default=False,
+                        help='Stop the test suite installing apks to device.',
+                        dest='noinstall')
+    parser.add_argument('--no-uninstall',
+                        action='store_true',
+                        default=False,
+                        help='Stop the test suite uninstalling apks after '
+                             'completion.',
+                        dest='nouninstall')
+    parser.add_argument('--print-to-stdout',
+                        action='store_true',
+                        default=False,
+                        help='Print all logging information to standard out.',
+                        dest='print_to_stdout')
+    parser.add_argument('--verbose', '-v',
+                        action='store_true',
+                        default=None,
+                        help='Store extra info in the log.')
+    parser.add_argument('--fail-fast',
+                        action='store_true',
+                        default=False,
+                        help='Exit the test suite immediately on the first failure.')
+    # The option generated from Config.emu_cmd below is spelt '--emu-cmd',
+    # so that is the spelling the help text has to advertise.
+    parser.add_argument('--run-emu',
+                        action='store_true',
+                        default=None,
+                        help='Spawn an emulator and run the test suite on that.'
+                             ' Specify the emulator command line in the config'
+                             ' file or with --emu-cmd.',
+                        dest='run_emu')
+
+    # Get the properties of the Config class and add a command line argument
+    # for each.
+    this_module = sys.modules[__name__]
+    for member_name, member_obj in inspect.getmembers(Config):
+        if (inspect.isdatadescriptor(member_obj) and
+            member_name not in ['__weakref__', 'device', 'verbose']):
+
+            # List type properties can take one or more arguments. The getter
+            # is called with the Config class itself in place of an instance.
+            num_args = None
+            if (isinstance(member_obj, property)
+                and isinstance(member_obj.fget(Config), list)):
+                num_args = '+'
+
+            opt_name = member_name.replace('_', '-')
+
+            # NOTE(review): this stores an empty string on the module under
+            # the dashed option name; kept as is, purpose not visible here.
+            setattr(this_module, opt_name, '')
+
+            parser.add_argument('--' + opt_name,
+                                nargs=num_args,
+                                help=member_obj.__doc__,
+                                dest=member_name)
+
+    return parser.parse_args()
+
+
+def _choice(first_choice, second_choice):
+    '''Return first_choice if it is not None otherwise return second_choice.
+
+    Only None counts as "not provided": an explicit falsy value such as 0,
+    False or an empty string is still returned as the first choice.
+
+    Args:
+        first_choice: The first choice value.
+        second_choice: The alternative value.
+
+    Returns:
+        The first argument if it is not None, and the second otherwise.
+    '''
+    return first_choice if first_choice is not None else second_choice
+
+
+class State(object):
+    '''This class manages all objects required by the test suite.
+
+    An instance holds the effective settings (command line options take
+    precedence over the values of the config), the ADB helper, the test
+    bundle, the open results file and the results collected so far.
+    '''
+
+    # pylint: disable=too-many-instance-attributes
+    # Since this is a state class many attributes are expected.
+
+    def __init__(self):
+        '''State constructor.
+
+        Raises:
+            TestSuiteException: When unable to load config file, when
+                                "verbose" is not a boolean, when --run-emu is
+                                used without an emulator command line, or
+                                when conflicting options are given.
+
+            AssertionError: When assertions fail.
+        '''
+
+        # Parse the command line options
+        args = _parse_args()
+
+        # create a config instance
+        if args.config:
+            # use the user supplied
+            config = State.load_user_configuration(args.config)
+        else:
+            # use the default configuration
+            config = Config()
+
+        # save the test denylist
+        self.blocklist = _choice(args.blocklist, config.blocklist)
+
+        # Allow any of the command line arguments to override the
+        # values in the config file.
+        self.adb_path = _choice(args.adb_path, config.adb_path)
+
+        self.host_port = int(_choice(args.host_port, config.host_port))
+
+        self.device = _choice(args.device, config.device)
+
+        self.user_specified_device = self.device
+
+        self.device_port = int(_choice(args.device_port, config.device_port))
+
+        self.lldb_server_path_device = _choice(args.lldb_server_path_device,
+                                               config.lldb_server_path_device)
+
+        self.lldb_server_path_host = _choice(args.lldb_server_path_host,
+                                             config.lldb_server_path_host)
+
+        self.aosp_product_path = _choice(args.aosp_product_path,
+                                         config.aosp_product_path)
+
+        self.log_file_path = _choice(args.log_file_path, config.log_file_path)
+
+        self.results_file_path = _choice(args.results_file_path,
+                                         config.results_file_path)
+
+        self.lldb_path = _choice(args.lldb_path, config.lldb_path)
+        self.print_to_stdout = args.print_to_stdout
+        self.verbose = _choice(args.verbose, config.verbose)
+        self.timeout = int(_choice(args.timeout, config.timeout))
+        self.emu_cmd = _choice(args.emu_cmd, config.emu_cmd)
+        self.run_emu = args.run_emu
+        self.wimpy = args.wimpy
+        # wimpy mode restricts the run to the java apps
+        self.bundle_types = args.bundle_types if not self.wimpy else ['java']
+        self.fail_fast = args.fail_fast
+
+        # validate the param "verbose"
+        if not isinstance(self.verbose, bool):
+            raise TestSuiteException('The parameter "verbose" should be a '
+                                     'boolean: {0}'.format(self.verbose))
+
+        # create result array
+        self.results = dict()
+        self.single_test = args.test
+
+        # initialise the logging facility
+        log_level = logging.INFO if not self.verbose else logging.DEBUG
+        util_log.initialise("driver",
+                            print_to_stdout=self.print_to_stdout,
+                            level=log_level,
+                            file_mode='w', # open for write
+                            file_path=self.log_file_path
+                            )
+        log = util_log.get_logger()
+
+        # --run-emu is meaningless without a command line to spawn
+        if self.run_emu and not self.emu_cmd:
+            raise TestSuiteException('Need to specify --emu-cmd (or specify a'
+                ' value in the config file) if using --run-emu.')
+
+        # create a results file; binary mode because the XML report written
+        # to it is produced by ET.tostring() with an explicit encoding
+        self.results_file = open(self.results_file_path, 'wb')
+
+        # create an android helper object
+        self.android = UtilAndroid(self.adb_path,
+                                   self.lldb_server_path_device,
+                                   self.device)
+        assert self.android
+
+        # create a test bundle
+        self.bundle = UtilBundle(self.android,
+                                 self.aosp_product_path)
+        assert self.bundle
+
+        # save the no pushing option
+        assert isinstance(args.noinstall, bool)
+        self.noinstall = args.noinstall
+
+        assert isinstance(args.nouninstall, bool)
+        self.nouninstall = args.nouninstall
+
+        # install only option
+        assert isinstance(args.install_only, bool)
+        self.install_only = args.install_only
+        if self.install_only:
+            log.log_and_print('Option --install-only set. The test APKs will '
+                              'be installed on the device but the tests will '
+                              'not be executed.')
+            if self.noinstall:
+                raise TestSuiteException('Conflicting options given: '
+                                         '--install-only and --no-install')
+
+        # TCP port modifier which is used to increment the port number used for
+        # each test case to avoid collisions.
+        self.port_mod = 0
+
+        # total number of test files that have been executed
+        self.test_count = 0
+
+    def get_android(self):
+        '''Return the android ADB helper instance.
+
+        Returns:
+            The android ADB helper, instance of UtilAndroid.
+        '''
+        assert self.android
+        return self.android
+
+    def get_bundle(self):
+        '''Return the test executable bundle.
+
+        Returns:
+            The test executable collection, instance of UtilBundle.
+        '''
+        return self.bundle
+
+    def add_result(self, name, app_type, result):
+        '''Add a test result to the collection.
+
+        Results are keyed by the (name, app_type) pair; adding the same pair
+        twice trips an assertion.
+
+        Args:
+            name: String name of the test that has executed.
+            app_type: type of app i.e. java, jni, or cpp
+            result: String result of the test, "pass", "fail", "error".
+        '''
+        key = (name, app_type)
+        assert key not in self.results
+        self.results[key] = result
+
+    def get_single_test(self):
+        '''Get the name of the single test to run.
+
+        Returns:
+            A string that is the name of the python file containing the test to
+            be run. If all tests are to be run this returns None.
+        '''
+        return self.single_test
+
+    @staticmethod
+    def load_user_configuration(path):
+        '''Load the test suite config from the given path.
+
+        Instantiate the Config class found in the module at the given path.
+        If no suitable class is available, it raises a TestSuiteException.
+
+        Args:
+            path: String location of the module.
+
+        Returns:
+            an instance of the Config class, defined in the module.
+
+        Raises:
+            TestSuiteException: when unable to import the module or when a
+                                subclass of Config is not found inside it.
+        '''
+
+        # load the module
+        config_module = load_py_module(path)
+        if not config_module:
+            raise TestSuiteException('Unable to import the module from "%s"'
+                                     % (path))
+
+        # look for a subclass of Config; a class that is itself named
+        # 'Config' is skipped
+        for name, value in inspect.getmembers(config_module):
+            if (inspect.isclass(value)
+                and name != 'Config'
+                and issubclass(value, Config)):
+                # that's our candidate
+                return value()
+
+        # otherwise there are no valid candidates
+        raise TestSuiteException('The provided user configuration is not '
+                                 'valid. The module must define a subclass '
+                                 'of Config')
+
+
+def _kill_emulator():
+    ''' Terminate the emulator process recorded in EMU_PROC, if there is one.
+
+    An OSError from terminate() is only logged at debug level.
+    '''
+    if not EMU_PROC:
+        # no emulator was started by this script
+        return
+
+    try:
+        EMU_PROC.terminate()
+    except OSError:
+        # can't kill a dead proc
+        util_log.get_logger().debug(
+            'Trying to kill an emulator but it is already dead.')
+
+
+def _check_emulator_terminated():
+    ''' Throw an exception if the emulator process has ended.
+
+    Raises:
+        TestSuiteException: If the emulator process has ended.
+        AssertionError: If no emulator process has been started.
+    '''
+    global EMU_PROC
+    assert EMU_PROC
+    # poll() returns None while the process is still running and the return
+    # code once it has ended. Compare against None so that an emulator that
+    # exited with status 0 is detected as well.
+    if EMU_PROC.poll() is not None:
+        stdout, stderr = EMU_PROC.communicate()
+        raise TestSuiteException('The emulator terminated with output:'
+            '\nstderr: {0}\nstdout: {1}.'.format(stderr, stdout))
+
+
+@deprecated()
+def _launch_emulator(state):
+    '''Launch the emulator and wait for it to boot.
+
+    Once the emulator has booted, adb is restarted as root and the host copy
+    of lldb-server is pushed to the device and made executable.
+
+    Args:
+        state: Test suite state collection, instance of State. Its emu_cmd
+               attribute is the command line used to run the emulator.
+
+    Raises:
+        TestSuiteException: If an emulator already exists or the emulator
+                            process terminated before we could connect to it, or
+                            we failed to copy lldb-server to the emulator.
+        AssertionError: If state.emu_cmd is empty.
+    '''
+    global EMU_PROC
+    android = state.android
+    if state.user_specified_device:
+        if android.device_with_substring_exists(state.user_specified_device):
+            # format the device name into the message (it used to be passed
+            # as a second exception argument and never substituted)
+            raise TestSuiteException(
+                'A device with name {0} already exists.'
+                .format(state.user_specified_device))
+    else:
+        if android.device_with_substring_exists('emulator'):
+            raise TestSuiteException('An emulator already exists.')
+
+    assert state.emu_cmd
+    EMU_PROC = subprocess.Popen(state.emu_cmd.split(),
+                                stdout=None,
+                                stderr=subprocess.STDOUT)
+
+    log = util_log.get_logger()
+    log.info('Launching emulator with command line {0}'.format(state.emu_cmd))
+
+    # Stage 1: wait until adb reports the emulator as a device.
+    tries_number = 180
+    tries = tries_number
+    found_device = False
+    while not found_device:
+        try:
+            android.validate_device(False, 'emulator')
+            found_device = True
+        except TestSuiteException:
+            tries -= 1
+            if tries == 0:
+                # Avoid infinitely looping if the emulator won't boot
+                log.warning(
+                    'Giving up trying to validate device after {0} tries.'
+                    .format(tries_number))
+                # bare raise keeps the original traceback
+                raise
+            _check_emulator_terminated()
+            # wait a bit and try again, maybe it has now booted
+            time.sleep(10)
+
+    # Stage 2: wait until the device reports that it has booted.
+    tries = 500
+    while not android.is_booted():
+        tries -= 1
+        if tries == 0:
+            # Avoid infinitely looping if the emulator won't boot
+            raise TestSuiteException('The emulator has failed to boot.')
+        _check_emulator_terminated()
+        time.sleep(5)
+
+    # Need to be root before we can push lldb-server
+    android.adb_root()
+    android.wait_for_device()
+
+    # Push the lldb-server executable to the device.
+    output = android.adb('push {0} {1}'.format(state.lldb_server_path_host,
+                                               state.lldb_server_path_device))
+
+    if 'failed to copy' in output or 'No such file or directory' in output:
+        raise TestSuiteException(
+            'unable to push lldb-server to the emulator: {0}.'
+            .format(output))
+
+    output = android.shell('chmod a+x {0}'
+                           .format(state.lldb_server_path_device))
+
+    if 'No such file or directory' in output:
+        raise TestSuiteException('Failed to copy lldb-server to the emulator.')
+
+
+def _restart_emulator(state):
+    '''Kill the emulator and start a new instance.
+
+    The current emulator process (if any) is terminated with
+    _kill_emulator() and a fresh one is started with _launch_emulator().
+
+    Args:
+        state: Test suite state collection, instance of State.
+    '''
+    _kill_emulator()
+    _launch_emulator(state)
+
+
+def _run_test(state, name, bundle_type):
+    '''Run one test file in a child python process and record its outcome.
+
+    A port pair is forwarded for lldb-server before the child is started and
+    the forwarding is removed again when it has finished. Ignored tests are
+    logged but not added to the results.
+
+    Args:
+        state: Test suite state collection, instance of State.
+        name: String file name of the test to execute.
+        bundle_type: string for the installed app type (cpp|jni|java)
+
+    Raises:
+        AssertionError: When assertion fails.
+        TestSuiteException: When adb is not alive and there is no emulator
+                            process of ours that could be restarted.
+        FailFastException: When the test did not pass and state.fail_fast
+                           is set.
+    '''
+    assert isinstance(name, str)
+
+    try:
+        state.android.check_adb_alive()
+    except TestSuiteException as expt:
+        if not EMU_PROC:
+            raise expt
+        # adb is gone but we own an emulator: bring up a fresh one
+        _restart_emulator(state)
+
+    log = util_log.get_logger()
+    sys.stdout.write('Running {0}\r'.format(name))
+    sys.stdout.flush()
+    log.info('Running {0}'.format(name))
+
+    suite_dir = os.path.dirname(os.path.realpath(__file__))
+    runner_script = os.path.join(suite_dir, 'tests', 'run_test.py')
+
+    # Forward port for lldb-server on the device to our host; every test
+    # gets its own offset from the base ports.
+    offset = state.port_mod
+    host_port = int(state.host_port) + offset
+    device_port = int(state.device_port) + offset
+    state.android.forward_port(host_port, device_port)
+    state.port_mod += 1
+
+    log.debug('Giving up control to {0}...'.format(name))
+
+    # Command line of the child process, every argument as a string
+    command = [str(argument) for argument in (
+        sys.executable,
+        runner_script,
+        name,
+        state.log_file_path,
+        state.adb_path,
+        state.lldb_server_path_device,
+        state.aosp_product_path,
+        device_port,
+        state.android.get_device_id(),
+        state.print_to_stdout,
+        state.verbose,
+        state.wimpy,
+        state.timeout,
+        bundle_type,
+    )]
+
+    return_code = subprocess.call(command)
+    state.test_count += 1
+    state.android.remove_port_forwarding()
+    log.seek_to_end()
+
+    # Translate the return code into a status name and the logger method
+    # used to report it; unknown codes are treated as errors.
+    known_statuses = {
+        util_constants.RC_TEST_OK: ('pass', log.info),
+        util_constants.RC_TEST_TIMEOUT: ('timeout', log.error),
+        util_constants.RC_TEST_IGNORED: ('ignored', log.info),
+        util_constants.RC_TEST_FAIL: ('fail', log.critical),
+    }
+    status_name, status_logger = known_statuses.get(
+        return_code, ('error', log.error))
+    log.info('Running %s: %s', name, status_name.upper())
+    status_logger("Test %r: %s", name, status_name)
+
+    # Special case for ignored tests - just return now
+    if return_code == util_constants.RC_TEST_IGNORED:
+        return
+
+    state.add_result(name, bundle_type, status_name)
+
+    if state.fail_fast and return_code != util_constants.RC_TEST_OK:
+        raise FailFastException(name)
+
+    # print a running total pass rate
+    passes = list(state.results.values()).count('pass')
+    log.info('Current pass rate: %s of %s executed.', passes, len(state.results))
+
+
+def _check_lldbserver_exists(state):
+    '''Verify that lldb-server can be run on the target device.
+
+    The lldb-server path from the state is executed through the device shell
+    and its output has to be a string that contains 'Usage:'.
+
+    Args:
+        state: Test suite state collection, instance of State.
+
+    Raises:
+        TestSuiteException: If the output is not a string or does not
+                            contain 'Usage:'.
+        AssertionError: If state or its android helper is missing.
+    '''
+    assert state
+
+    android = state.get_android()
+    assert android
+
+    out = android.shell(state.lldb_server_path_device, False)
+    if not isinstance(out, str) or 'Usage:' not in out:
+        raise TestSuiteException(
+            'Unable to verify valid lldb-server on target')
+
+
+def _suite_pre_run(state):
+    '''This function is executed before the test cases are run (setup).
+
+    Args:
+        state: Test suite state collection, instance of State.
+
+    Return:
+        True if the pre_run step completes without error, False when a
+        TestSuiteException was raised by one of the steps.
+        Checks made:
+            - Validating that adb exists and runs.
+            - Validating that a device is attached.
+            - We have root access to the device.
+            - All test binaries were pushed to the device.
+            - The port for lldb-server was forwarded correctly.
+
+    Raises:
+        AssertionError: When assertions fail.
+    '''
+    assert state
+    log = util_log.get_logger()
+
+    try:
+        android = state.get_android()
+        bundle = state.get_bundle()
+        assert android
+        assert bundle
+
+        # validate ADB helper class
+        android.validate_adb()
+        log.log_and_print('Located ADB')
+
+        if state.run_emu:
+            log.log_and_print('Launching emulator...')
+            _launch_emulator(state)
+            log.log_and_print('Started emulator ' + android.device)
+        else:
+            android.validate_device()
+            log.log_and_print('Located device ' + android.device)
+
+        # nothing will be installed, so the apps have to be there already
+        if state.noinstall and not state.single_test:
+            bundle.check_apps_installed(state.wimpy)
+
+        # elevate to root user
+        android.adb_root()
+        android.wait_for_device()
+        # check that lldb-server exists on device
+        android.kill_servers()
+        _check_lldbserver_exists(state)
+
+        if not state.noinstall:
+            # push all tests to the device
+            log.log_and_print('Pushing all tests...')
+            bundle.push_all()
+            log.log_and_print('Pushed all tests')
+        log.log_and_print('Pre run complete')
+
+    except TestSuiteException as expt:
+        log.exception('Test suite pre run failure')
+
+        # Even if we are logging the error, it may be helpful and more
+        # immediate to find out the error into the terminal.
+        # str() is used rather than the 'message' attribute, which built-in
+        # exceptions only have on Python 2; main() formats it the same way.
+        log.log_and_print('ERROR: Unable to set up the test suite: %s\n'
+                          % str(expt), logging.ERROR)
+
+        return False
+    return True
+
+
+def _suite_post_run(state):
+    '''This function is executed after the test cases have run (teardown).
+
+    Uninstalls the tests (unless --no-install or --no-uninstall was given),
+    prints a summary of the results and writes them as an XML 'testsuite'
+    element to the results file.
+
+    Args:
+        state: Test suite state collection, instance of State.
+    Returns:
+        Number of failures, i.e. of results other than 'pass'.
+    '''
+    log = util_log.get_logger()
+
+    if not state.noinstall and not state.nouninstall:
+        if state.wimpy:
+            state.bundle.uninstall_all_apk()
+        else:
+            state.bundle.uninstall_all()
+        log.log_and_print('Uninstalled/Deleted all tests')
+
+    total = 0
+    passes = 0
+    failures = 0
+
+    results = ET.Element('testsuite')
+    results.attrib['name'] = 'LLDB RS Test Suite'
+
+    # key is the (test name, app type) pair, value the status string
+    for key, value in state.results.items():
+        total += 1
+        if value == 'pass':
+            passes += 1
+        else:
+            failures += 1
+
+        # test case name, followed by pass, failure or error elements
+        testcase = ET.Element('testcase')
+        testcase.attrib['name'] = "%s:%s" % key
+        result_element = ET.Element(value)
+        result_element.text = "%s:%s" % key
+        testcase.append(result_element)
+        results.append(testcase)
+
+    assert passes + failures == total, 'Invalid test results status'
+    if failures:
+        log.log_and_print(
+            'The following failures occurred:\n%s\n' %
+            '\n'.join('failed: %s:%s' % test_spec
+                for test_spec, result in state.results.items() if result != 'pass'
+        ))
+
+    log.log_and_print('{0} of {1} passed'.format(passes, total))
+    if total:
+        # floor division keeps the rate a whole number whether or not '/'
+        # means true division for the running interpreter
+        log.log_and_print('{0}% rate'.format((passes*100)//total))
+
+    results.attrib['tests'] = str(total)
+    state.results_file.write(ET.tostring(results, encoding='iso-8859-1'))
+
+    return failures
+
+
+def _discover_tests(state):
+    '''Build the list of test files to run.
+
+    When a single test was requested only that file is returned ('.py' is
+    appended if it is missing). Otherwise every sub directory of 'tests'
+    except 'harness' is searched for files named 'test*.py' that are not in
+    the blocklist.
+
+    Args:
+        state: Test suite state collection, instance of State.
+
+    Returns:
+        List of strings, test file names from the 'tests' directory.
+    '''
+    single_test = state.get_single_test()
+    if single_test is not None:
+        if not single_test.endswith('.py'):
+            single_test = single_test + '.py'
+        return [single_test]
+
+    here = os.path.dirname(os.path.realpath(__file__))
+    tests_dir = os.path.join(here, 'tests')
+
+    tests = []
+    for entry in os.listdir(tests_dir):
+        # the harness directory holds support code, not tests
+        if entry == 'harness':
+            continue
+
+        candidate_dir = os.path.join(tests_dir, entry)
+        if not os.path.isdir(candidate_dir):
+            continue
+
+        tests.extend(
+            file_name for file_name in os.listdir(candidate_dir)
+            if file_name.startswith('test')
+            and file_name.endswith('.py')
+            and file_name not in state.blocklist)
+
+    return tests
+
+
+def _deduce_python_path(state):
+    '''Try to deduce the PYTHONPATH environment variable via the LLDB binary.
+
+    Runs "<lldb> -P" and, if it prints anything, stores the stripped output
+    in os.environ['PYTHONPATH'], replacing any previous value.
+
+    Args:
+        state: Test suite state collection, instance of State.
+
+    Returns:
+        True if PYTHONPATH has been updated, False otherwise.
+
+    Raises:
+        TestSuiteException: If lldb path provided in the config or command line
+                            is incorrect.
+        AssertionError: If an assertion fails.
+    '''
+
+    lldb_path = state.lldb_path
+    if not lldb_path:
+        # lldb may not be provided in preference of a manual $PYTHONPATH
+        return False
+
+    params = [lldb_path, '-P']
+
+    try:
+        # universal_newlines makes the pipe deliver text instead of bytes,
+        # which is what os.environ requires on Python 3
+        proc = subprocess.Popen(params, stdout=subprocess.PIPE,
+                                universal_newlines=True)
+    except OSError as err:
+        error_string = 'Could not run lldb at %s: %s' % (lldb_path, str(err))
+        raise TestSuiteException(error_string)
+
+    stdout = proc.communicate()[0]
+    if stdout:
+        os.environ['PYTHONPATH'] = stdout.strip()
+        return True
+
+    return False
+
+
+def main():
+    '''The lldb-renderscript test suite entry point.
+
+    Exit status: 0 when the tests were run and none failed, 1 when tests
+    failed, on a fail-fast exit or on an internal assertion error, and 2 on
+    a TestSuiteException. With --install-only the function returns normally
+    once the pre-run step has finished. The emulator (if any) is killed and
+    logging is shut down on every path.
+    '''
+    log = None
+
+    try:
+        # parse the command line
+        state = State()
+        assert state
+
+        # logging is initialised in State()
+        log = util_log.get_logger()
+
+        # if we can, set PYTHONPATH for lldb bindings
+        if not _deduce_python_path(state):
+            log.log_and_print('Unable to deduce PYTHONPATH', logging.WARN)
+
+        # pre run step
+        if not _suite_pre_run(state):
+            raise TestSuiteException('Test suite pre-run step failed')
+        # discover all tests and execute them
+        tests = _discover_tests(state)
+        log.log_and_print('Found {0} tests'.format(len(tests)))
+        if state.install_only:
+            log.log_and_print('Test applications installed. Terminating due to '
+                              '--install-only option')
+        else:
+            # run the tests
+            for bundle_type in state.bundle_types:
+                log.info("Running bundle type '%s'", bundle_type)
+                for item in tests:
+                    _run_test(state, item, bundle_type)
+            # post run step; sys.exit() is used because quit() is only
+            # installed by the site module for interactive sessions
+            sys.exit(0 if _suite_post_run(state) == 0 else 1)
+
+    except AssertionError:
+        if log:
+            log.exception('Internal test suite error')
+
+        print('Internal test suite error')
+        sys.exit(1)
+
+    except FailFastException:
+        if log:
+            log.exception('Early exit after first test failure')
+        sys.exit(1)
+
+    except TestSuiteException as error:
+        if log:
+            log.exception('Test suite exception')
+
+        print('{0}'.format(str(error)))
+        sys.exit(2)
+
+    finally:
+        _kill_emulator()
+        logging.shutdown()
+
+def signal_handler(_, _unused):
+    '''Signal handler for SIGINT, caused by the user typing Ctrl-C.
+
+    Both handler arguments are ignored. A short notice is printed and the
+    process is ended with exit status 1 through os._exit().
+    '''
+    # pylint: disable=unused-argument
+    # pylint: disable=protected-access
+    print('Ctrl+C!')
+    os._exit(1)
+
+
+# execution trampoline
+if __name__ == '__main__':
+    signal.signal(signal.SIGINT, signal_handler)
+    main()

diff --git a/tests/lldb/tests/__init__.py b/tests/lldb/tests/__init__.py
new file mode 100644
index 0000000..d91549f
--- /dev/null
+++ b/tests/lldb/tests/__init__.py

@@ -0,0 +1,19 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''This module contains a test runner, tests and utility code'''
+
+
+
+from . import harness

diff --git a/tests/lldb/tests/harness/RS_funs.py b/tests/lldb/tests/harness/RS_funs.py
new file mode 100644
index 0000000..a5a0539
--- /dev/null
+++ b/tests/lldb/tests/harness/RS_funs.py

@@ -0,0 +1,1401 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''A list of signatures for RS builtin functions and util functions for them.
+
+Function signature syntax is usually C-like, however, fixed values can also be
+specified for those functions where the input range is restricted.
+Lines in the function table beginning with - are comments.
+Also contains utility functions to build an LLDB expression from a single
+function line.
+'''
+
+# A __future__ import is only effective as a real statement placed right after
+# the module docstring; it must not be embedded in the docstring text.
+from __future__ import absolute_import
+
+import re
+
+# Remove blank and comment lines using a lambda.
+FUNC_LIST = filter(lambda line: line.strip()
+                                and not line.strip().startswith('-'), '''
+- math functions
+
+uchar abs(char v);
+uchar2 abs(char2 v);
+uchar3 abs(char3 v);
+uchar4 abs(char4 v);
+uint abs(int v);
+uint2 abs(int2 v);
+uint3 abs(int3 v);
+uint4 abs(int4 v);
+ushort abs(short v);
+ushort2 abs(short2 v);
+ushort3 abs(short3 v);
+ushort4 abs(short4 v);
+
+float acos(float v);
+float2 acos(float2 v);
+float3 acos(float3 v);
+float4 acos(float4 v);
+
+float acosh(float v);
+float2 acosh(float2 v);
+float3 acosh(float3 v);
+float4 acosh(float4 v);
+
+float acospi(float v);
+float2 acospi(float2 v);
+float3 acospi(float3 v);
+float4 acospi(float4 v);
+
+float asin(float v);
+float2 asin(float2 v);
+float3 asin(float3 v);
+float4 asin(float4 v);
+
+float asinh(float v);
+float2 asinh(float2 v);
+float3 asinh(float3 v);
+float4 asinh(float4 v);
+
+float asinpi(float v);
+float2 asinpi(float2 v);
+float3 asinpi(float3 v);
+float4 asinpi(float4 v);
+
+float atan(float v);
+float2 atan(float2 v);
+float3 atan(float3 v);
+float4 atan(float4 v);
+
+float atan2(float numerator, float denominator);
+float2 atan2(float2 numerator, float2 denominator);
+float3 atan2(float3 numerator, float3 denominator);
+float4 atan2(float4 numerator, float4 denominator);
+
+float atan2pi(float numerator, float denominator);
+float2 atan2pi(float2 numerator, float2 denominator);
+float3 atan2pi(float3 numerator, float3 denominator);
+float4 atan2pi(float4 numerator, float4 denominator);
+
+float atanh(float v);
+float2 atanh(float2 v);
+float3 atanh(float3 v);
+float4 atanh(float4 v);
+
+float atanpi(float v);
+float2 atanpi(float2 v);
+float3 atanpi(float3 v);
+float4 atanpi(float4 v);
+
+float cbrt(float v);
+float2 cbrt(float2 v);
+float3 cbrt(float3 v);
+float4 cbrt(float4 v);
+
+float ceil(float v);
+float2 ceil(float2 v);
+float3 ceil(float3 v);
+float4 ceil(float4 v);
+
+char clamp(char value, char min_value, char max_value);
+char2 clamp(char2 value, char min_value, char max_value);
+char2 clamp(char2 value, char2 min_value, char2 max_value);
+char3 clamp(char3 value, char min_value, char max_value);
+char3 clamp(char3 value, char3 min_value, char3 max_value);
+char4 clamp(char4 value, char min_value, char max_value);
+char4 clamp(char4 value, char4 min_value, char4 max_value);
+float clamp(float value, float min_value, float max_value);
+float2 clamp(float2 value, float min_value, float max_value);
+float2 clamp(float2 value, float2 min_value, float2 max_value);
+float3 clamp(float3 value, float min_value, float max_value);
+float3 clamp(float3 value, float3 min_value, float3 max_value);
+float4 clamp(float4 value, float min_value, float max_value);
+float4 clamp(float4 value, float4 min_value, float4 max_value);
+int clamp(int value, int min_value, int max_value);
+int2 clamp(int2 value, int min_value, int max_value);
+int2 clamp(int2 value, int2 min_value, int2 max_value);
+int3 clamp(int3 value, int min_value, int max_value);
+int3 clamp(int3 value, int3 min_value, int3 max_value);
+int4 clamp(int4 value, int min_value, int max_value);
+int4 clamp(int4 value, int4 min_value, int4 max_value);
+long clamp(long value, long min_value, long max_value);
+long2 clamp(long2 value, long min_value, long max_value);
+long2 clamp(long2 value, long2 min_value, long2 max_value);
+long3 clamp(long3 value, long min_value, long max_value);
+long3 clamp(long3 value, long3 min_value, long3 max_value);
+long4 clamp(long4 value, long min_value, long max_value);
+long4 clamp(long4 value, long4 min_value, long4 max_value);
+short clamp(short value, short min_value, short max_value);
+short2 clamp(short2 value, short min_value, short max_value);
+short2 clamp(short2 value, short2 min_value, short2 max_value);
+short3 clamp(short3 value, short min_value, short max_value);
+short3 clamp(short3 value, short3 min_value, short3 max_value);
+short4 clamp(short4 value, short min_value, short max_value);
+short4 clamp(short4 value, short4 min_value, short4 max_value);
+uchar clamp(uchar value, uchar min_value, uchar max_value);
+uchar2 clamp(uchar2 value, uchar min_value, uchar max_value);
+uchar2 clamp(uchar2 value, uchar2 min_value, uchar2 max_value);
+uchar3 clamp(uchar3 value, uchar min_value, uchar max_value);
+uchar3 clamp(uchar3 value, uchar3 min_value, uchar3 max_value);
+uchar4 clamp(uchar4 value, uchar min_value, uchar max_value);
+uchar4 clamp(uchar4 value, uchar4 min_value, uchar4 max_value);
+uint clamp(uint value, uint min_value, uint max_value);
+uint2 clamp(uint2 value, uint min_value, uint max_value);
+uint2 clamp(uint2 value, uint2 min_value, uint2 max_value);
+uint3 clamp(uint3 value, uint min_value, uint max_value);
+uint3 clamp(uint3 value, uint3 min_value, uint3 max_value);
+uint4 clamp(uint4 value, uint min_value, uint max_value);
+uint4 clamp(uint4 value, uint4 min_value, uint4 max_value);
+ulong clamp(ulong value, ulong min_value, ulong max_value);
+ulong2 clamp(ulong2 value, ulong min_value, ulong max_value);
+ulong2 clamp(ulong2 value, ulong2 min_value, ulong2 max_value);
+ulong3 clamp(ulong3 value, ulong min_value, ulong max_value);
+ulong3 clamp(ulong3 value, ulong3 min_value, ulong3 max_value);
+ulong4 clamp(ulong4 value, ulong min_value, ulong max_value);
+ulong4 clamp(ulong4 value, ulong4 min_value, ulong4 max_value);
+ushort clamp(ushort value, ushort min_value, ushort max_value);
+ushort2 clamp(ushort2 value, ushort min_value, ushort max_value);
+ushort2 clamp(ushort2 value, ushort2 min_value, ushort2 max_value);
+ushort3 clamp(ushort3 value, ushort min_value, ushort max_value);
+ushort3 clamp(ushort3 value, ushort3 min_value, ushort3 max_value);
+ushort4 clamp(ushort4 value, ushort min_value, ushort max_value);
+ushort4 clamp(ushort4 value, ushort4 min_value, ushort4 max_value);
+
+char clz(char value);
+char2 clz(char2 value);
+char3 clz(char3 value);
+char4 clz(char4 value);
+int clz(int value);
+int2 clz(int2 value);
+int3 clz(int3 value);
+int4 clz(int4 value);
+short clz(short value);
+short2 clz(short2 value);
+short3 clz(short3 value);
+short4 clz(short4 value);
+uchar clz(uchar value);
+uchar2 clz(uchar2 value);
+uchar3 clz(uchar3 value);
+uchar4 clz(uchar4 value);
+uint clz(uint value);
+uint2 clz(uint2 value);
+uint3 clz(uint3 value);
+uint4 clz(uint4 value);
+ushort clz(ushort value);
+ushort2 clz(ushort2 value);
+ushort3 clz(ushort3 value);
+ushort4 clz(ushort4 value);
+
+float copysign(float magnitude_value, float sign_value);
+float2 copysign(float2 magnitude_value, float2 sign_value);
+float3 copysign(float3 magnitude_value, float3 sign_value);
+float4 copysign(float4 magnitude_value, float4 sign_value);
+
+float cos(float v);
+float2 cos(float2 v);
+float3 cos(float3 v);
+float4 cos(float4 v);
+
+float cosh(float v);
+float2 cosh(float2 v);
+float3 cosh(float3 v);
+float4 cosh(float4 v);
+
+float cospi(float v);
+float2 cospi(float2 v);
+float3 cospi(float3 v);
+float4 cospi(float4 v);
+
+float erf(float v);
+float2 erf(float2 v);
+float3 erf(float3 v);
+float4 erf(float4 v);
+
+float erfc(float v);
+float2 erfc(float2 v);
+float3 erfc(float3 v);
+float4 erfc(float4 v);
+
+float exp(float v);
+float2 exp(float2 v);
+float3 exp(float3 v);
+float4 exp(float4 v);
+
+float exp10(float v);
+float2 exp10(float2 v);
+float3 exp10(float3 v);
+float4 exp10(float4 v);
+
+float exp2(float v);
+float2 exp2(float2 v);
+float3 exp2(float3 v);
+float4 exp2(float4 v);
+
+float expm1(float v);
+float2 expm1(float2 v);
+float3 expm1(float3 v);
+float4 expm1(float4 v);
+
+float fabs(float v);
+float2 fabs(float2 v);
+float3 fabs(float3 v);
+float4 fabs(float4 v);
+
+float fdim(float a, float b);
+float2 fdim(float2 a, float2 b);
+float3 fdim(float3 a, float3 b);
+float4 fdim(float4 a, float4 b);
+
+float floor(float v);
+float2 floor(float2 v);
+float3 floor(float3 v);
+float4 floor(float4 v);
+
+float fma(float multiplicand1, float multiplicand2, float offset);
+float2 fma(float2 multiplicand1, float2 multiplicand2, float2 offset);
+float3 fma(float3 multiplicand1, float3 multiplicand2, float3 offset);
+float4 fma(float4 multiplicand1, float4 multiplicand2, float4 offset);
+
+float fmax(float a, float b);
+float2 fmax(float2 a, float b);
+float2 fmax(float2 a, float2 b);
+float3 fmax(float3 a, float b);
+float3 fmax(float3 a, float3 b);
+float4 fmax(float4 a, float b);
+float4 fmax(float4 a, float4 b);
+
+float fmin(float a, float b);
+float2 fmin(float2 a, float b);
+float2 fmin(float2 a, float2 b);
+float3 fmin(float3 a, float b);
+float3 fmin(float3 a, float3 b);
+float4 fmin(float4 a, float b);
+float4 fmin(float4 a, float4 b);
+
+float fmod(float numerator, float denominator);
+float2 fmod(float2 numerator, float2 denominator);
+float3 fmod(float3 numerator, float3 denominator);
+float4 fmod(float4 numerator, float4 denominator);
+
+float fract(float v, float* floor);
+float2 fract(float2 v, float2* floor);
+float3 fract(float3 v, float3* floor);
+float4 fract(float4 v, float4* floor);
+
+float frexp(float v, int* exponent);
+float2 frexp(float2 v, int2* exponent);
+float3 frexp(float3 v, int3* exponent);
+float4 frexp(float4 v, int4* exponent);
+
+float half_recip(float v);
+float2 half_recip(float2 v);
+float3 half_recip(float3 v);
+float4 half_recip(float4 v);
+
+float half_rsqrt(float v);
+float2 half_rsqrt(float2 v);
+float3 half_rsqrt(float3 v);
+float4 half_rsqrt(float4 v);
+
+float half_sqrt(float v);
+float2 half_sqrt(float2 v);
+float3 half_sqrt(float3 v);
+float4 half_sqrt(float4 v);
+
+float hypot(float a, float b);
+float2 hypot(float2 a, float2 b);
+float3 hypot(float3 a, float3 b);
+float4 hypot(float4 a, float4 b);
+
+int ilogb(float v);
+int2 ilogb(float2 v);
+int3 ilogb(float3 v);
+int4 ilogb(float4 v);
+
+float ldexp(float mantissa, int exponent);
+float2 ldexp(float2 mantissa, int exponent);
+float2 ldexp(float2 mantissa, int2 exponent);
+float3 ldexp(float3 mantissa, int exponent);
+float3 ldexp(float3 mantissa, int3 exponent);
+float4 ldexp(float4 mantissa, int exponent);
+float4 ldexp(float4 mantissa, int4 exponent);
+
+float lgamma(float v);
+float lgamma(float v, int* sign_of_gamma);
+float2 lgamma(float2 v);
+float2 lgamma(float2 v, int2* sign_of_gamma);
+float3 lgamma(float3 v);
+float3 lgamma(float3 v, int3* sign_of_gamma);
+float4 lgamma(float4 v);
+float4 lgamma(float4 v, int4* sign_of_gamma);
+
+float log(float v);
+float2 log(float2 v);
+float3 log(float3 v);
+float4 log(float4 v);
+
+float log10(float v);
+float2 log10(float2 v);
+float3 log10(float3 v);
+float4 log10(float4 v);
+
+float log1p(float v);
+float2 log1p(float2 v);
+float3 log1p(float3 v);
+float4 log1p(float4 v);
+
+float log2(float v);
+float2 log2(float2 v);
+float3 log2(float3 v);
+float4 log2(float4 v);
+
+float logb(float v);
+float2 logb(float2 v);
+float3 logb(float3 v);
+float4 logb(float4 v);
+
+float mad(float multiplicand1, float multiplicand2, float offset);
+float2 mad(float2 multiplicand1, float2 multiplicand2, float2 offset);
+float3 mad(float3 multiplicand1, float3 multiplicand2, float3 offset);
+float4 mad(float4 multiplicand1, float4 multiplicand2, float4 offset);
+
+char max(char a, char b);
+char2 max(char2 a, char2 b);
+char3 max(char3 a, char3 b);
+char4 max(char4 a, char4 b);
+float max(float a, float b);
+float2 max(float2 a, float2 b);
+float3 max(float3 a, float3 b);
+float4 max(float4 a, float4 b);
+int max(int a, int b);
+int2 max(int2 a, int2 b);
+int3 max(int3 a, int3 b);
+int4 max(int4 a, int4 b);
+long max(long a, long b);
+long2 max(long2 a, long2 b);
+long3 max(long3 a, long3 b);
+long4 max(long4 a, long4 b);
+short max(short a, short b);
+short2 max(short2 a, short2 b);
+short3 max(short3 a, short3 b);
+short4 max(short4 a, short4 b);
+uchar max(uchar a, uchar b);
+uchar2 max(uchar2 a, uchar2 b);
+uchar3 max(uchar3 a, uchar3 b);
+uchar4 max(uchar4 a, uchar4 b);
+uint max(uint a, uint b);
+uint2 max(uint2 a, uint2 b);
+uint3 max(uint3 a, uint3 b);
+uint4 max(uint4 a, uint4 b);
+ulong max(ulong a, ulong b);
+ulong2 max(ulong2 a, ulong2 b);
+ulong3 max(ulong3 a, ulong3 b);
+ulong4 max(ulong4 a, ulong4 b);
+ushort max(ushort a, ushort b);
+ushort2 max(ushort2 a, ushort2 b);
+ushort3 max(ushort3 a, ushort3 b);
+ushort4 max(ushort4 a, ushort4 b);
+
+char min(char a, char b);
+char2 min(char2 a, char2 b);
+char3 min(char3 a, char3 b);
+char4 min(char4 a, char4 b);
+float min(float a, float b);
+float2 min(float2 a, float2 b);
+float3 min(float3 a, float3 b);
+float4 min(float4 a, float4 b);
+int min(int a, int b);
+int2 min(int2 a, int2 b);
+int3 min(int3 a, int3 b);
+int4 min(int4 a, int4 b);
+long min(long a, long b);
+long2 min(long2 a, long2 b);
+long3 min(long3 a, long3 b);
+long4 min(long4 a, long4 b);
+short min(short a, short b);
+short2 min(short2 a, short2 b);
+short3 min(short3 a, short3 b);
+short4 min(short4 a, short4 b);
+uchar min(uchar a, uchar b);
+uchar2 min(uchar2 a, uchar2 b);
+uchar3 min(uchar3 a, uchar3 b);
+uchar4 min(uchar4 a, uchar4 b);
+uint min(uint a, uint b);
+uint2 min(uint2 a, uint2 b);
+uint3 min(uint3 a, uint3 b);
+uint4 min(uint4 a, uint4 b);
+ulong min(ulong a, ulong b);
+ulong2 min(ulong2 a, ulong2 b);
+ulong3 min(ulong3 a, ulong3 b);
+ulong4 min(ulong4 a, ulong4 b);
+ushort min(ushort a, ushort b);
+ushort2 min(ushort2 a, ushort2 b);
+ushort3 min(ushort3 a, ushort3 b);
+ushort4 min(ushort4 a, ushort4 b);
+
+float mix(float start, float stop, float fraction);
+float2 mix(float2 start, float2 stop, float fraction);
+float2 mix(float2 start, float2 stop, float2 fraction);
+float3 mix(float3 start, float3 stop, float fraction);
+float3 mix(float3 start, float3 stop, float3 fraction);
+float4 mix(float4 start, float4 stop, float fraction);
+float4 mix(float4 start, float4 stop, float4 fraction);
+
+float modf(float v, float* integral_part);
+float2 modf(float2 v, float2* integral_part);
+float3 modf(float3 v, float3* integral_part);
+float4 modf(float4 v, float4* integral_part);
+
+float nan(uint v);
+
+float native_acos(float v);
+float2 native_acos(float2 v);
+float3 native_acos(float3 v);
+float4 native_acos(float4 v);
+
+float native_acosh(float v);
+float2 native_acosh(float2 v);
+float3 native_acosh(float3 v);
+float4 native_acosh(float4 v);
+
+float native_acospi(float v);
+float2 native_acospi(float2 v);
+float3 native_acospi(float3 v);
+float4 native_acospi(float4 v);
+
+float native_asin(float v);
+float2 native_asin(float2 v);
+float3 native_asin(float3 v);
+float4 native_asin(float4 v);
+
+float native_asinh(float v);
+float2 native_asinh(float2 v);
+float3 native_asinh(float3 v);
+float4 native_asinh(float4 v);
+
+float native_asinpi(float v);
+float2 native_asinpi(float2 v);
+float3 native_asinpi(float3 v);
+float4 native_asinpi(float4 v);
+
+float native_atan(float v);
+float2 native_atan(float2 v);
+float3 native_atan(float3 v);
+float4 native_atan(float4 v);
+
+float native_atan2(float numerator, float denominator);
+float2 native_atan2(float2 numerator, float2 denominator);
+float3 native_atan2(float3 numerator, float3 denominator);
+float4 native_atan2(float4 numerator, float4 denominator);
+
+float native_atan2pi(float numerator, float denominator);
+float2 native_atan2pi(float2 numerator, float2 denominator);
+float3 native_atan2pi(float3 numerator, float3 denominator);
+float4 native_atan2pi(float4 numerator, float4 denominator);
+
+float native_atanh(float v);
+float2 native_atanh(float2 v);
+float3 native_atanh(float3 v);
+float4 native_atanh(float4 v);
+
+float native_atanpi(float v);
+float2 native_atanpi(float2 v);
+float3 native_atanpi(float3 v);
+float4 native_atanpi(float4 v);
+
+float native_cbrt(float v);
+float2 native_cbrt(float2 v);
+float3 native_cbrt(float3 v);
+float4 native_cbrt(float4 v);
+
+float native_cos(float v);
+float2 native_cos(float2 v);
+float3 native_cos(float3 v);
+float4 native_cos(float4 v);
+
+float native_cosh(float v);
+float2 native_cosh(float2 v);
+float3 native_cosh(float3 v);
+float4 native_cosh(float4 v);
+
+float native_cospi(float v);
+float2 native_cospi(float2 v);
+float3 native_cospi(float3 v);
+float4 native_cospi(float4 v);
+
+float native_divide(float left_vector, float right_vector);
+float2 native_divide(float2 left_vector, float2 right_vector);
+float3 native_divide(float3 left_vector, float3 right_vector);
+float4 native_divide(float4 left_vector, float4 right_vector);
+
+float native_exp(float v);
+float2 native_exp(float2 v);
+float3 native_exp(float3 v);
+float4 native_exp(float4 v);
+
+float native_exp10(float v);
+float2 native_exp10(float2 v);
+float3 native_exp10(float3 v);
+float4 native_exp10(float4 v);
+
+float native_exp2(float v);
+float2 native_exp2(float2 v);
+float3 native_exp2(float3 v);
+float4 native_exp2(float4 v);
+
+float native_expm1(float v);
+float2 native_expm1(float2 v);
+float3 native_expm1(float3 v);
+float4 native_expm1(float4 v);
+
+float native_hypot(float a, float b);
+float2 native_hypot(float2 a, float2 b);
+float3 native_hypot(float3 a, float3 b);
+float4 native_hypot(float4 a, float4 b);
+
+float native_log(float v);
+float2 native_log(float2 v);
+float3 native_log(float3 v);
+float4 native_log(float4 v);
+
+float native_log10(float v);
+float2 native_log10(float2 v);
+float3 native_log10(float3 v);
+float4 native_log10(float4 v);
+
+float native_log1p(float v);
+float2 native_log1p(float2 v);
+float3 native_log1p(float3 v);
+float4 native_log1p(float4 v);
+
+float native_log2(float v);
+float2 native_log2(float2 v);
+float3 native_log2(float3 v);
+float4 native_log2(float4 v);
+
+float native_powr(float base, float exponent);
+float2 native_powr(float2 base, float2 exponent);
+float3 native_powr(float3 base, float3 exponent);
+float4 native_powr(float4 base, float4 exponent);
+
+float native_recip(float v);
+float2 native_recip(float2 v);
+float3 native_recip(float3 v);
+float4 native_recip(float4 v);
+
+float native_rootn(float v, int n);
+float2 native_rootn(float2 v, int2 n);
+float3 native_rootn(float3 v, int3 n);
+float4 native_rootn(float4 v, int4 n);
+
+float native_rsqrt(float v);
+float2 native_rsqrt(float2 v);
+float3 native_rsqrt(float3 v);
+float4 native_rsqrt(float4 v);
+
+float native_sin(float v);
+float2 native_sin(float2 v);
+float3 native_sin(float3 v);
+float4 native_sin(float4 v);
+
+float native_sincos(float v, float* cos);
+float2 native_sincos(float2 v, float2* cos);
+float3 native_sincos(float3 v, float3* cos);
+float4 native_sincos(float4 v, float4* cos);
+
+float native_sinh(float v);
+float2 native_sinh(float2 v);
+float3 native_sinh(float3 v);
+float4 native_sinh(float4 v);
+
+float native_sinpi(float v);
+float2 native_sinpi(float2 v);
+float3 native_sinpi(float3 v);
+float4 native_sinpi(float4 v);
+
+float native_sqrt(float v);
+float2 native_sqrt(float2 v);
+float3 native_sqrt(float3 v);
+float4 native_sqrt(float4 v);
+
+float native_tan(float v);
+float2 native_tan(float2 v);
+float3 native_tan(float3 v);
+float4 native_tan(float4 v);
+
+float native_tanh(float v);
+float2 native_tanh(float2 v);
+float3 native_tanh(float3 v);
+float4 native_tanh(float4 v);
+
+float native_tanpi(float v);
+float2 native_tanpi(float2 v);
+float3 native_tanpi(float3 v);
+float4 native_tanpi(float4 v);
+
+float nextafter(float v, float target);
+float2 nextafter(float2 v, float2 target);
+float3 nextafter(float3 v, float3 target);
+float4 nextafter(float4 v, float4 target);
+
+float pow(float base, float exponent);
+float2 pow(float2 base, float2 exponent);
+float3 pow(float3 base, float3 exponent);
+float4 pow(float4 base, float4 exponent);
+
+float pown(float base, int exponent);
+float2 pown(float2 base, int2 exponent);
+float3 pown(float3 base, int3 exponent);
+float4 pown(float4 base, int4 exponent);
+
+float powr(float base, float exponent);
+float2 powr(float2 base, float2 exponent);
+float3 powr(float3 base, float3 exponent);
+float4 powr(float4 base, float4 exponent);
+
+float radians(float v);
+float2 radians(float2 v);
+float3 radians(float3 v);
+float4 radians(float4 v);
+
+float remainder(float numerator, float denominator);
+float2 remainder(float2 numerator, float2 denominator);
+float3 remainder(float3 numerator, float3 denominator);
+float4 remainder(float4 numerator, float4 denominator);
+
+float remquo(float numerator, float denominator, int* quotient);
+float2 remquo(float2 numerator, float2 denominator, int2* quotient);
+float3 remquo(float3 numerator, float3 denominator, int3* quotient);
+float4 remquo(float4 numerator, float4 denominator, int4* quotient);
+
+float rint(float v);
+float2 rint(float2 v);
+float3 rint(float3 v);
+float4 rint(float4 v);
+
+float rootn(float v, int n);
+float2 rootn(float2 v, int2 n);
+float3 rootn(float3 v, int3 n);
+float4 rootn(float4 v, int4 n);
+
+float round(float v);
+float2 round(float2 v);
+float3 round(float3 v);
+float4 round(float4 v);
+
+char rsClamp(char amount, char low, char high);
+int rsClamp(int amount, int low, int high);
+short rsClamp(short amount, short low, short high);
+uchar rsClamp(uchar amount, uchar low, uchar high);
+uint rsClamp(uint amount, uint low, uint high);
+ushort rsClamp(ushort amount, ushort low, ushort high);
+
+float rsFrac(float v);
+
+float rsRand(float max_value);
+float rsRand(float min_value, float max_value);
+int rsRand(int max_value);
+int rsRand(int min_value, int max_value);
+
+float rsqrt(float v);
+float2 rsqrt(float2 v);
+float3 rsqrt(float3 v);
+float4 rsqrt(float4 v);
+
+float sign(float v);
+float2 sign(float2 v);
+float3 sign(float3 v);
+float4 sign(float4 v);
+
+float sin(float v);
+float2 sin(float2 v);
+float3 sin(float3 v);
+float4 sin(float4 v);
+
+float sincos(float v, float* cos);
+float2 sincos(float2 v, float2* cos);
+float3 sincos(float3 v, float3* cos);
+float4 sincos(float4 v, float4* cos);
+
+float sinh(float v);
+float2 sinh(float2 v);
+float3 sinh(float3 v);
+float4 sinh(float4 v);
+
+float sinpi(float v);
+float2 sinpi(float2 v);
+float3 sinpi(float3 v);
+float4 sinpi(float4 v);
+
+float sqrt(float v);
+float2 sqrt(float2 v);
+float3 sqrt(float3 v);
+float4 sqrt(float4 v);
+
+float step(float edge, float v);
+float2 step(float edge, float2 v);
+float2 step(float2 edge, float v);
+float2 step(float2 edge, float2 v);
+float3 step(float edge, float3 v);
+float3 step(float3 edge, float v);
+float3 step(float3 edge, float3 v);
+float4 step(float edge, float4 v);
+float4 step(float4 edge, float v);
+float4 step(float4 edge, float4 v);
+
+float tan(float v);
+float2 tan(float2 v);
+float3 tan(float3 v);
+float4 tan(float4 v);
+
+float tanh(float v);
+float2 tanh(float2 v);
+float3 tanh(float3 v);
+float4 tanh(float4 v);
+
+float tanpi(float v);
+float2 tanpi(float2 v);
+float3 tanpi(float3 v);
+float4 tanpi(float4 v);
+
+float tgamma(float v);
+float2 tgamma(float2 v);
+float3 tgamma(float3 v);
+float4 tgamma(float4 v);
+
+float trunc(float v);
+float2 trunc(float2 v);
+float3 trunc(float3 v);
+float4 trunc(float4 v);
+
+uchar4 rsPackColorTo8888(float r, float g, float b);
+uchar4 rsPackColorTo8888(float r, float g, float b, float a);
+uchar4 rsPackColorTo8888(float3 color);
+uchar4 rsPackColorTo8888(float4 color);
+float4 rsUnpackColor8888(uchar4 c);
+
+float4 rsYuvToRGBA_float4(uchar y, uchar u, uchar v);
+uchar4 rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v);
+
+- vector functions
+
+float3 cross(float3 left_vector, float3 right_vector);
+float4 cross(float4 left_vector, float4 right_vector);
+
+float distance(float  left_vector, float  right_vector);
+float distance(float2 left_vector, float2 right_vector);
+float distance(float3 left_vector, float3 right_vector);
+float distance(float4 left_vector, float4 right_vector);
+
+float dot(float  left_vector, float  right_vector);
+float dot(float2 left_vector, float2 right_vector);
+float dot(float3 left_vector, float3 right_vector);
+float dot(float4 left_vector, float4 right_vector);
+
+float fast_distance(float  left_vector, float  right_vector);
+float fast_distance(float2 left_vector, float2 right_vector);
+float fast_distance(float3 left_vector, float3 right_vector);
+float fast_distance(float4 left_vector, float4 right_vector);
+
+float fast_length(float  v);
+float fast_length(float2 v);
+float fast_length(float3 v);
+float fast_length(float4 v);
+
+float  fast_normalize(float  v);
+float2 fast_normalize(float2 v);
+float3 fast_normalize(float3 v);
+float4 fast_normalize(float4 v);
+
+float length(float  v);
+float length(float2 v);
+float length(float3 v);
+float length(float4 v);
+
+float native_distance(float  left_vector, float  right_vector);
+float native_distance(float2 left_vector, float2 right_vector);
+float native_distance(float3 left_vector, float3 right_vector);
+float native_distance(float4 left_vector, float4 right_vector);
+
+float native_length(float  v);
+float native_length(float2 v);
+float native_length(float3 v);
+float native_length(float4 v);
+
+float  native_normalize(float  v);
+float2 native_normalize(float2 v);
+float3 native_normalize(float3 v);
+float4 native_normalize(float4 v);
+
+float  normalize(float  v);
+float2 normalize(float2 v);
+float3 normalize(float3 v);
+float4 normalize(float4 v);
+
+- conversion functions
+
+char2 convert_char2(char2 v);
+char2 convert_char2(double2 v);
+char2 convert_char2(float2 v);
+char2 convert_char2(int2 v);
+char2 convert_char2(long2 v);
+char2 convert_char2(short2 v);
+char2 convert_char2(uchar2 v);
+char2 convert_char2(uint2 v);
+char2 convert_char2(ulong2 v);
+char2 convert_char2(ushort2 v);
+
+char3 convert_char3(char3 v);
+char3 convert_char3(double3 v);
+char3 convert_char3(float3 v);
+char3 convert_char3(int3 v);
+char3 convert_char3(long3 v);
+char3 convert_char3(short3 v);
+char3 convert_char3(uchar3 v);
+char3 convert_char3(uint3 v);
+char3 convert_char3(ulong3 v);
+char3 convert_char3(ushort3 v);
+
+char4 convert_char4(char4 v);
+char4 convert_char4(double4 v);
+char4 convert_char4(float4 v);
+char4 convert_char4(int4 v);
+char4 convert_char4(long4 v);
+char4 convert_char4(short4 v);
+char4 convert_char4(uchar4 v);
+char4 convert_char4(uint4 v);
+char4 convert_char4(ulong4 v);
+char4 convert_char4(ushort4 v);
+
+double2 convert_double2(char2 v);
+double2 convert_double2(double2 v);
+double2 convert_double2(float2 v);
+double2 convert_double2(int2 v);
+double2 convert_double2(long2 v);
+double2 convert_double2(short2 v);
+double2 convert_double2(uchar2 v);
+double2 convert_double2(uint2 v);
+double2 convert_double2(ulong2 v);
+double2 convert_double2(ushort2 v);
+
+double3 convert_double3(char3 v);
+double3 convert_double3(double3 v);
+double3 convert_double3(float3 v);
+double3 convert_double3(int3 v);
+double3 convert_double3(long3 v);
+double3 convert_double3(short3 v);
+double3 convert_double3(uchar3 v);
+double3 convert_double3(uint3 v);
+double3 convert_double3(ulong3 v);
+double3 convert_double3(ushort3 v);
+
+double4 convert_double4(char4 v);
+double4 convert_double4(double4 v);
+double4 convert_double4(float4 v);
+double4 convert_double4(int4 v);
+double4 convert_double4(long4 v);
+double4 convert_double4(short4 v);
+double4 convert_double4(uchar4 v);
+double4 convert_double4(uint4 v);
+double4 convert_double4(ulong4 v);
+double4 convert_double4(ushort4 v);
+
+float2 convert_float2(char2 v);
+float2 convert_float2(double2 v);
+float2 convert_float2(float2 v);
+float2 convert_float2(int2 v);
+float2 convert_float2(long2 v);
+float2 convert_float2(short2 v);
+float2 convert_float2(uchar2 v);
+float2 convert_float2(uint2 v);
+float2 convert_float2(ulong2 v);
+float2 convert_float2(ushort2 v);
+
+float3 convert_float3(char3 v);
+float3 convert_float3(double3 v);
+float3 convert_float3(float3 v);
+float3 convert_float3(int3 v);
+float3 convert_float3(long3 v);
+float3 convert_float3(short3 v);
+float3 convert_float3(uchar3 v);
+float3 convert_float3(uint3 v);
+float3 convert_float3(ulong3 v);
+float3 convert_float3(ushort3 v);
+
+float4 convert_float4(char4 v);
+float4 convert_float4(double4 v);
+float4 convert_float4(float4 v);
+float4 convert_float4(int4 v);
+float4 convert_float4(long4 v);
+float4 convert_float4(short4 v);
+float4 convert_float4(uchar4 v);
+float4 convert_float4(uint4 v);
+float4 convert_float4(ulong4 v);
+float4 convert_float4(ushort4 v);
+
+int2 convert_int2(char2 v);
+int2 convert_int2(double2 v);
+int2 convert_int2(float2 v);
+int2 convert_int2(int2 v);
+int2 convert_int2(long2 v);
+int2 convert_int2(short2 v);
+int2 convert_int2(uchar2 v);
+int2 convert_int2(uint2 v);
+int2 convert_int2(ulong2 v);
+int2 convert_int2(ushort2 v);
+
+int3 convert_int3(char3 v);
+int3 convert_int3(double3 v);
+int3 convert_int3(float3 v);
+int3 convert_int3(int3 v);
+int3 convert_int3(long3 v);
+int3 convert_int3(short3 v);
+int3 convert_int3(uchar3 v);
+int3 convert_int3(uint3 v);
+int3 convert_int3(ulong3 v);
+int3 convert_int3(ushort3 v);
+
+int4 convert_int4(char4 v);
+int4 convert_int4(double4 v);
+int4 convert_int4(float4 v);
+int4 convert_int4(int4 v);
+int4 convert_int4(long4 v);
+int4 convert_int4(short4 v);
+int4 convert_int4(uchar4 v);
+int4 convert_int4(uint4 v);
+int4 convert_int4(ulong4 v);
+int4 convert_int4(ushort4 v);
+
+long2 convert_long2(char2 v);
+long2 convert_long2(double2 v);
+long2 convert_long2(float2 v);
+long2 convert_long2(int2 v);
+long2 convert_long2(long2 v);
+long2 convert_long2(short2 v);
+long2 convert_long2(uchar2 v);
+long2 convert_long2(uint2 v);
+long2 convert_long2(ulong2 v);
+long2 convert_long2(ushort2 v);
+
+long3 convert_long3(char3 v);
+long3 convert_long3(double3 v);
+long3 convert_long3(float3 v);
+long3 convert_long3(int3 v);
+long3 convert_long3(long3 v);
+long3 convert_long3(short3 v);
+long3 convert_long3(uchar3 v);
+long3 convert_long3(uint3 v);
+long3 convert_long3(ulong3 v);
+long3 convert_long3(ushort3 v);
+
+long4 convert_long4(char4 v);
+long4 convert_long4(double4 v);
+long4 convert_long4(float4 v);
+long4 convert_long4(int4 v);
+long4 convert_long4(long4 v);
+long4 convert_long4(short4 v);
+long4 convert_long4(uchar4 v);
+long4 convert_long4(uint4 v);
+long4 convert_long4(ulong4 v);
+long4 convert_long4(ushort4 v);
+
+short2 convert_short2(char2 v);
+short2 convert_short2(double2 v);
+short2 convert_short2(float2 v);
+short2 convert_short2(int2 v);
+short2 convert_short2(long2 v);
+short2 convert_short2(short2 v);
+short2 convert_short2(uchar2 v);
+short2 convert_short2(uint2 v);
+short2 convert_short2(ulong2 v);
+short2 convert_short2(ushort2 v);
+
+short3 convert_short3(char3 v);
+short3 convert_short3(double3 v);
+short3 convert_short3(float3 v);
+short3 convert_short3(int3 v);
+short3 convert_short3(long3 v);
+short3 convert_short3(short3 v);
+short3 convert_short3(uchar3 v);
+short3 convert_short3(uint3 v);
+short3 convert_short3(ulong3 v);
+short3 convert_short3(ushort3 v);
+
+short4 convert_short4(char4 v);
+short4 convert_short4(double4 v);
+short4 convert_short4(float4 v);
+short4 convert_short4(int4 v);
+short4 convert_short4(long4 v);
+short4 convert_short4(short4 v);
+short4 convert_short4(uchar4 v);
+short4 convert_short4(uint4 v);
+short4 convert_short4(ulong4 v);
+short4 convert_short4(ushort4 v);
+
+uchar2 convert_uchar2(char2 v);
+uchar2 convert_uchar2(double2 v);
+uchar2 convert_uchar2(float2 v);
+uchar2 convert_uchar2(int2 v);
+uchar2 convert_uchar2(long2 v);
+uchar2 convert_uchar2(short2 v);
+uchar2 convert_uchar2(uchar2 v);
+uchar2 convert_uchar2(uint2 v);
+uchar2 convert_uchar2(ulong2 v);
+uchar2 convert_uchar2(ushort2 v);
+
+uchar3 convert_uchar3(char3 v);
+uchar3 convert_uchar3(double3 v);
+uchar3 convert_uchar3(float3 v);
+uchar3 convert_uchar3(int3 v);
+uchar3 convert_uchar3(long3 v);
+uchar3 convert_uchar3(short3 v);
+uchar3 convert_uchar3(uchar3 v);
+uchar3 convert_uchar3(uint3 v);
+uchar3 convert_uchar3(ulong3 v);
+uchar3 convert_uchar3(ushort3 v);
+
+uchar4 convert_uchar4(char4 v);
+uchar4 convert_uchar4(double4 v);
+uchar4 convert_uchar4(float4 v);
+uchar4 convert_uchar4(int4 v);
+uchar4 convert_uchar4(long4 v);
+uchar4 convert_uchar4(short4 v);
+uchar4 convert_uchar4(uchar4 v);
+uchar4 convert_uchar4(uint4 v);
+uchar4 convert_uchar4(ulong4 v);
+uchar4 convert_uchar4(ushort4 v);
+
+uint2 convert_uint2(char2 v);
+uint2 convert_uint2(double2 v);
+uint2 convert_uint2(float2 v);
+uint2 convert_uint2(int2 v);
+uint2 convert_uint2(long2 v);
+uint2 convert_uint2(short2 v);
+uint2 convert_uint2(uchar2 v);
+uint2 convert_uint2(uint2 v);
+uint2 convert_uint2(ulong2 v);
+uint2 convert_uint2(ushort2 v);
+
+uint3 convert_uint3(char3 v);
+uint3 convert_uint3(double3 v);
+uint3 convert_uint3(float3 v);
+uint3 convert_uint3(int3 v);
+uint3 convert_uint3(long3 v);
+uint3 convert_uint3(short3 v);
+uint3 convert_uint3(uchar3 v);
+uint3 convert_uint3(uint3 v);
+uint3 convert_uint3(ulong3 v);
+uint3 convert_uint3(ushort3 v);
+
+uint4 convert_uint4(char4 v);
+uint4 convert_uint4(double4 v);
+uint4 convert_uint4(float4 v);
+uint4 convert_uint4(int4 v);
+uint4 convert_uint4(long4 v);
+uint4 convert_uint4(short4 v);
+uint4 convert_uint4(uchar4 v);
+uint4 convert_uint4(uint4 v);
+uint4 convert_uint4(ulong4 v);
+uint4 convert_uint4(ushort4 v);
+
+ulong2 convert_ulong2(char2 v);
+ulong2 convert_ulong2(double2 v);
+ulong2 convert_ulong2(float2 v);
+ulong2 convert_ulong2(int2 v);
+ulong2 convert_ulong2(long2 v);
+ulong2 convert_ulong2(short2 v);
+ulong2 convert_ulong2(uchar2 v);
+ulong2 convert_ulong2(uint2 v);
+ulong2 convert_ulong2(ulong2 v);
+ulong2 convert_ulong2(ushort2 v);
+
+ulong3 convert_ulong3(char3 v);
+ulong3 convert_ulong3(double3 v);
+ulong3 convert_ulong3(float3 v);
+ulong3 convert_ulong3(int3 v);
+ulong3 convert_ulong3(long3 v);
+ulong3 convert_ulong3(short3 v);
+ulong3 convert_ulong3(uchar3 v);
+ulong3 convert_ulong3(uint3 v);
+ulong3 convert_ulong3(ulong3 v);
+ulong3 convert_ulong3(ushort3 v);
+
+ulong4 convert_ulong4(char4 v);
+ulong4 convert_ulong4(double4 v);
+ulong4 convert_ulong4(float4 v);
+ulong4 convert_ulong4(int4 v);
+ulong4 convert_ulong4(long4 v);
+ulong4 convert_ulong4(short4 v);
+ulong4 convert_ulong4(uchar4 v);
+ulong4 convert_ulong4(uint4 v);
+ulong4 convert_ulong4(ulong4 v);
+ulong4 convert_ulong4(ushort4 v);
+
+ushort2 convert_ushort2(char2 v);
+ushort2 convert_ushort2(double2 v);
+ushort2 convert_ushort2(float2 v);
+ushort2 convert_ushort2(int2 v);
+ushort2 convert_ushort2(long2 v);
+ushort2 convert_ushort2(short2 v);
+ushort2 convert_ushort2(uchar2 v);
+ushort2 convert_ushort2(uint2 v);
+ushort2 convert_ushort2(ulong2 v);
+ushort2 convert_ushort2(ushort2 v);
+
+ushort3 convert_ushort3(char3 v);
+ushort3 convert_ushort3(double3 v);
+ushort3 convert_ushort3(float3 v);
+ushort3 convert_ushort3(int3 v);
+ushort3 convert_ushort3(long3 v);
+ushort3 convert_ushort3(short3 v);
+ushort3 convert_ushort3(uchar3 v);
+ushort3 convert_ushort3(uint3 v);
+ushort3 convert_ushort3(ulong3 v);
+ushort3 convert_ushort3(ushort3 v);
+
+ushort4 convert_ushort4(char4 v);
+ushort4 convert_ushort4(double4 v);
+ushort4 convert_ushort4(float4 v);
+ushort4 convert_ushort4(int4 v);
+ushort4 convert_ushort4(long4 v);
+ushort4 convert_ushort4(short4 v);
+ushort4 convert_ushort4(uchar4 v);
+ushort4 convert_ushort4(uint4 v);
+ushort4 convert_ushort4(ulong4 v);
+ushort4 convert_ushort4(ushort4 v);
+
+uchar4 rsPackColorTo8888(float r, float g, float b);
+uchar4 rsPackColorTo8888(float r, float g, float b, float a);
+uchar4 rsPackColorTo8888(float3 color);
+uchar4 rsPackColorTo8888(float4 color);
+
+float4 rsUnpackColor8888(uchar4 c);
+
+float4 rsYuvToRGBA_float4(uchar y, uchar u, uchar v);
+
+uchar4 rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v);
+
+- matrix functions, some of these are not supported yet
+
+-bool rsMatrixInverse(rs_matrix4x4* m);
+
+-bool rsMatrixInverseTranspose(rs_matrix4x4* m);
+
+-void rsMatrixLoad(rs_matrix2x2* destination, float* array);
+-void rsMatrixLoad(rs_matrix2x2* destination, rs_matrix2x2* source);
+-void rsMatrixLoad(rs_matrix3x3* destination, float* array);
+-void rsMatrixLoad(rs_matrix3x3* destination, rs_matrix3x3* source);
+-void rsMatrixLoad(rs_matrix4x4* destination, float* array);
+-void rsMatrixLoad(rs_matrix4x4* destination, rs_matrix2x2* source);
+-void rsMatrixLoad(rs_matrix4x4* destination, rs_matrix3x3* source);
+-void rsMatrixLoad(rs_matrix4x4* destination, rs_matrix4x4* source);
+
+-void rsMatrixLoadFrustum(rs_matrix4x4* m, float left, float right, float bottom, float top, float near, float far);
+
+-void rsMatrixLoadIdentity(rs_matrix2x2* m);
+-void rsMatrixLoadIdentity(rs_matrix3x3* m);
+-void rsMatrixLoadIdentity(rs_matrix4x4* m);
+
+-void rsMatrixLoadMultiply(rs_matrix2x2* m, rs_matrix2x2* lhs, rs_matrix2x2* rhs);
+-void rsMatrixLoadMultiply(rs_matrix3x3* m, rs_matrix3x3* lhs, rs_matrix3x3* rhs);
+-void rsMatrixLoadMultiply(rs_matrix4x4* m, rs_matrix4x4* lhs, rs_matrix4x4* rhs);
+
+-void rsMatrixLoadOrtho(rs_matrix4x4* m, float left, float right, float bottom, float top, float near, float far);
+
+-void rsMatrixLoadPerspective(rs_matrix4x4* m, float fovy, float aspect, float near, float far);
+
+-void rsMatrixLoadRotate(rs_matrix4x4* m, float rot, float x, float y, float z);
+
+-void rsMatrixLoadScale(rs_matrix4x4* m, float x, float y, float z);
+
+-void rsMatrixLoadTranslate(rs_matrix4x4* m, float x, float y, float z);
+
+float2 rsMatrixMultiply(rs_matrix2x2* m, float2 in);
+float3 rsMatrixMultiply(rs_matrix3x3* m, float2 in);
+float3 rsMatrixMultiply(rs_matrix3x3* m, float3 in);
+float4 rsMatrixMultiply(rs_matrix4x4* m, float2 in);
+float4 rsMatrixMultiply(rs_matrix4x4* m, float3 in);
+float4 rsMatrixMultiply(rs_matrix4x4* m, float4 in);
+void rsMatrixMultiply(rs_matrix2x2* m, rs_matrix2x2* rhs);
+void rsMatrixMultiply(rs_matrix3x3* m, rs_matrix3x3* rhs);
+void rsMatrixMultiply(rs_matrix4x4* m, rs_matrix4x4* rhs);
+
+-void rsMatrixRotate(rs_matrix4x4* m, float rot, float x, float y, float z);
+
+-void rsMatrixScale(rs_matrix4x4* m, float x, float y, float z);
+
+void rsMatrixSet(rs_matrix2x2* m, 0, 1, float v);
+void rsMatrixSet(rs_matrix3x3* m, 2, 0, float v);
+void rsMatrixSet(rs_matrix4x4* m, 1, 3, float v);
+
+-void rsMatrixTranslate(rs_matrix4x4* m, float x, float y, float z);
+
+-void rsMatrixTranspose(rs_matrix2x2* m);
+-void rsMatrixTranspose(rs_matrix3x3* m);
+-void rsMatrixTranspose(rs_matrix4x4* m);
+
+- quaternion functions
+
+void rsQuaternionAdd(rs_quaternion* q, rs_quaternion* rhs);
+
+void rsQuaternionConjugate(rs_quaternion* q);
+
+float rsQuaternionDot(rs_quaternion* q0, rs_quaternion* q1);
+
+void rsQuaternionGetMatrixUnit(rs_matrix4x4* m, rs_quaternion* q);
+
+void rsQuaternionLoadRotate(rs_quaternion* q, float rot, float x, float y, float z);
+
+void rsQuaternionLoadRotateUnit(rs_quaternion* q, float rot, float x, float y, float z);
+
+void rsQuaternionMultiply(rs_quaternion* q, rs_quaternion* rhs);
+void rsQuaternionMultiply(rs_quaternion* q, float scalar);
+
+void rsQuaternionNormalize(rs_quaternion* q);
+
+void rsQuaternionSet(rs_quaternion* q, rs_quaternion* rhs);
+void rsQuaternionSet(rs_quaternion* q, float w, float x, float y, float z);
+
+void rsQuaternionSlerp(rs_quaternion* q, rs_quaternion* q0, rs_quaternion* q1, float t);
+
+- allocation data access functions, this is a subset, since we don't have all types of allocations
+
+-void rsAllocationCopy1DRange(allocation_1D_global, uint dstOff, uint dstMip, uint count, allocation_1D_global2, uint srcOff, uint srcMip);
+
+-void rsAllocationCopy2DRange(allocation_2D_global, uint dstXoff, uint dstYoff, uint dstMip, rs_allocation_cubemap_face dstFace, uint width, uint height, allocation_2D_global2, uint srcXoff, uint srcYoff, uint srcMip, rs_allocation_cubemap_face srcFace);
+
+int2 rsAllocationVLoadX_int2(allocation_1D_global, 0);
+int2 rsAllocationVLoadX_int2(allocation_2D_global, 24, 25);
+int2 rsAllocationVLoadX_int2(allocation_3D_global, 0, 1, 0);
+int3 rsAllocationVLoadX_int3(allocation_1D_global, 1);
+int3 rsAllocationVLoadX_int3(allocation_2D_global, 27, 28);
+int3 rsAllocationVLoadX_int3(allocation_3D_global, 1, 0, 1);
+int4 rsAllocationVLoadX_int4(allocation_1D_global, 0);
+int4 rsAllocationVLoadX_int4(allocation_2D_global, 29, 30);
+int4 rsAllocationVLoadX_int4(allocation_3D_global, 0, 1, 0);
+
+void rsAllocationVStoreX_int2(allocation_1D_global, int2 val, 2);
+void rsAllocationVStoreX_int2(allocation_2D_global, int2 val, 6, 7);
+void rsAllocationVStoreX_int2(allocation_3D_global, int2 val, 0, 1, 0);
+void rsAllocationVStoreX_int3(allocation_1D_global, int3 val, 1);
+void rsAllocationVStoreX_int3(allocation_2D_global, int3 val, 12, 13);
+void rsAllocationVStoreX_int3(allocation_3D_global, int3 val, 1, 0, 1);
+void rsAllocationVStoreX_int4(allocation_1D_global, int4 val, 0);
+void rsAllocationVStoreX_int4(allocation_2D_global, int4 val, 18, 19);
+void rsAllocationVStoreX_int4(allocation_3D_global, int4 val, 0, 1, 0);
+
+void* rsGetElementAt(allocation_1D_global, 0);
+void* rsGetElementAt(allocation_2D_global, 20, 21);
+void* rsGetElementAt(allocation_3D_global, 1, 0, 1);
+int rsGetElementAt_int(allocation_1D_global, 1);
+int rsGetElementAt_int(allocation_2D_global, 22, 23);
+int rsGetElementAt_int(allocation_3D_global, 0, 1, 0);
+
+uchar rsGetElementAtYuv_uchar_U(allocation_YUV_2D_global, 0, 1);
+
+uchar rsGetElementAtYuv_uchar_V(allocation_YUV_2D_global, 2, 3);
+
+uchar rsGetElementAtYuv_uchar_Y(allocation_YUV_2D_global, 0, 1);
+
+float4 rsSample(allocation_1D_global, rs_sampler s, float location);
+float4 rsSample(allocation_1D_global, rs_sampler s, float location, float lod);
+float4 rsSample(allocation_1D_global, rs_sampler s, float2 location);
+float4 rsSample(allocation_1D_global, rs_sampler s, float2 location, float lod);
+
+void rsSetElementAt(allocation_1D_global, int* ptr, 2);
+void rsSetElementAt(allocation_2D_global, int* ptr, 24, 25);
+void rsSetElementAt_int(allocation_1D_global, int val, 0);
+void rsSetElementAt_int(allocation_2D_global, int val, 26, 27);
+void rsSetElementAt_int(allocation_3D_global, int val, 1, 0, 1);
+'''.splitlines())
+
+
+TYPE_MAP = {  # RenderScript type name -> regex source for its '(type)' text
+    'void': '',
+    'char': r'\((signed )?char\)',
+    'uchar': r'\(uchar\)',
+    'short': r'\(short\)',
+    'ushort': r'\(ushort\)',
+    'int': r'\(int\)',
+    'uint': r'\(uint\)',
+    'long': r'\((long )?long\)',
+    'ulong': r'\(ulong\)',
+    'float': r'\(float\)',
+    'float2': r'\(float2\)',
+    'float3': r'\(float3\)',
+    'float4': r'\(float4\)',
+}
+
+
+def _build_arg(token):
+    '''Turn one argument of a C declaration into an lldb expression.
+
+    A token made of a single word is a literal value and is passed through
+    verbatim. Otherwise the first word is the argument's type, and the
+    result is the name of the global variable of that type.
+
+    Args:
+        token: A string representing a single argument to a function. This
+               can be either [type][name] (e.g. int arg) or [value] (e.g. 5).
+
+    Returns:
+        The string that is the lldb expression for that argument, e.g.
+        int_global or 5.
+    '''
+    words = token.split()
+    if len(words) == 1:
+        # A lone word is a fixed constant: emit it untouched.
+        return token
+
+    type_name = words[0]
+
+    # The globals are named after the type without its rs_ prefix.
+    if type_name.startswith('rs_'):
+        type_name = type_name[3:]
+
+    # A pointer parameter receives the address of the global.
+    if type_name[-1] == '*':
+        return '&' + type_name[:-1] + '_global'
+
+    return type_name + '_global'
+
+
+def build_expr(line):
+    '''Translate a function prototype into an lldb expression calling it.
+
+    The arguments of the call are produced by _build_arg, one per
+    parameter of the declaration.
+
+    Args:
+        line: A string representing a function declaration.
+
+    Returns:
+        A (return type, lldb expression) pair of strings.
+    '''
+    tokens = re.findall(r"[^(),;]+", line)
+    assert len(tokens) > 0
+
+    # The first token holds the return type followed by the function name;
+    # every later token is one argument of the declaration.
+    head = tokens[0].split()
+    ret = head[0]
+    name = head[1]
+
+    # The arguments are separated by ', ' in the generated call.
+    call_args = ', '.join(_build_arg(tok) for tok in tokens[1:])
+
+    expr = 'expr {0}('.format(name)
+    expr += call_args
+    expr += ')'
+    return ret, expr

diff --git a/tests/lldb/tests/harness/__init__.py b/tests/lldb/tests/harness/__init__.py
new file mode 100644
index 0000000..863ac22
--- /dev/null
+++ b/tests/lldb/tests/harness/__init__.py

@@ -0,0 +1,25 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''This module contains utility code used by the test suite'''
+
+from __future__ import absolute_import
+
+from . import exception
+from . import util_constants
+from .exception import TestSuiteException
+from .test_base import TestBase
+from .util_android import UtilAndroid
+from .util_bundle import UtilBundle
+from . import RS_funs
\ No newline at end of file

diff --git a/tests/lldb/tests/harness/assert_mixins.py b/tests/lldb/tests/harness/assert_mixins.py
new file mode 100644
index 0000000..94a9e22
--- /dev/null
+++ b/tests/lldb/tests/harness/assert_mixins.py

@@ -0,0 +1,102 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Set of mixins for asserting common RenderScript lldb interactions
+That should cut down boilerplate
+To use these assertions simply inherit from them along with your
+`TestBase`:
+
+    >>> class MyLLDBRenderScriptTest(TestBaseRemote, CoordinateAssertionsMixin):
+    >>>     pass
+
+This will give you access to the useful assertion methods related to Coordinates
+
+NOTE: These are strictly clean mixins for `TestBase`. All classes here should
+strictly inherit only from `object`
+"""
+
+
+class CoordinateAssertionsMixin(object):
+    '''Coordinate assertions; the host class must provide try_command.'''
+    def assert_coord_bp_set(
+            self, breakpoint_expr, x, y=None, z=None, kernel_type='kernel'
+        ):
+        '''
+        Assert that a breakpoint conditional on a given coordinate is confirmed
+        by the renderscript breakpoint resolver.
+        This only tests that the breakpoint is registered, not that it is hit.
+            breakpoint_expr: the expression (e.g. the name of a function, or a
+            file and line).
+            kernel_type: The breakpoint resolver to use:
+                (reduction|kernel|scriptgroup), default='kernel'
+            x: x coordinate: required
+            y, z: optional y, and z coordinates
+        '''
+        # z can only be sent as the third field, so a missing y becomes 0.
+        y = 0 if z is not None and y is None else y
+        coord_text = ','.join(map(str, filter(lambda p: p is not None, (x, y, z))))
+        self.try_command(
+            'language renderscript %s breakpoint set %s -c %s' % (
+                kernel_type, breakpoint_expr, coord_text
+            ),
+            [r'Breakpoint(s) created'],
+            expected_regex=[
+                r'Conditional kernel breakpoint on coordinate.+%d,\s*%d,\s*%d' % (
+                    x or 0, y or 0, z or 0
+                )
+            ]
+        )
+
+    def assert_coord_stop(
+            self, soname, func_name, x, y=None, z=None, stopped=True
+        ):
+        '''Run lldb commands to check that coordinates match expected values.
+
+        Args:
+            soname: The name of the renderscript script module e.g. 'allocs'
+                    for librs.allocs.so
+            func_name: String that is the name of the kernel function
+            x, y, z: The expected coordinates (None for y or z means 0).
+            stopped: If True, resume with 'process continue' and expect a
+                     breakpoint stop in func_name; if False, only check 'bt'.
+
+        Raises:
+            TestFail: An lldb command did not give the expected output.
+        '''
+        if stopped:
+            self.try_command(
+                'process continue',
+                expected_regex=[
+                    r'resuming',
+                    r'Process \d+ stopped',
+                    r'stop reason = breakpoint',
+                    r'frame #0: (0x[0-9a-fA-F]+ )?librs.%s.so`%s' % (
+                        soname, func_name)
+                ]
+            )
+        else:
+            self.try_command(
+                'bt',
+                expected_regex=[
+                    'stop reason = breakpoint',
+                    'frame #0:',
+                    r'librs.*\.so`%s' % func_name
+                ]
+            )
+
+        self.try_command(
+            'language renderscript kernel coordinate',
+            ['(%d, %d, %d)' % (x, y or 0, z or 0)]  # expected: list of literals
+        )

diff --git a/tests/lldb/tests/harness/decorators.py b/tests/lldb/tests/harness/decorators.py
new file mode 100644
index 0000000..e4a49b3
--- /dev/null
+++ b/tests/lldb/tests/harness/decorators.py

@@ -0,0 +1,157 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function, absolute_import
+
+import functools
+import warnings
+import inspect
+
+
+class skip_conditional(object):
+    '''
+    Test method decorator that marks a test method as ignorable if the given
+    argument evaluates as Truthy. If the argument is callable, then it is
+    called with the test instance and its return value is the predicate.
+    A skipped test is not run; a message is printed and True is returned.
+
+    >>> class MyTestClass(TestBase):
+    ...     def test_something(self):
+    ...         pass
+    ...
+    ...     @skip_conditional(not sys.platform.startswith("linux"))
+    ...     def test_some_linux_behaviour(self):
+    ...         assert "vmlinuz" in open("/proc/cmdline").read()
+    ...
+    ...     @skip_conditional(lambda self: True)
+    ...     def test_that_never_runs(self):
+    ...         pass
+    '''
+    def __init__(self, skip_condition, message="skipped"):
+        self._skip_condition = skip_condition
+        self._message = message
+
+    def __call__(self, func):
+        @functools.wraps(func)
+        def inner(*args, **kwargs):
+            skip_condition = self._skip_condition
+            if callable(skip_condition):
+                # args[0] is ``self``
+                skip_condition = skip_condition(args[0])
+            if skip_condition:
+                print("skipping %r - %s" % (func, self._message))
+                return True
+            # Forward every argument, not just ``self``.
+            return func(*args, **kwargs)
+        return inner
+
+
+class skip_test(skip_conditional):
+    '''
+    Unconditionally skip a test; ``skip_condition`` is accepted but ignored.
+    '''
+    def __init__(self, skip_condition=None, *args, **kwargs):
+        super(skip_test, self).__init__(True, *args, **kwargs)
+
+
+java_only_test = lambda: skip_conditional(lambda self: self.app_type != 'java')
+
+cpp_only_test = lambda: skip_conditional(lambda self: self.app_type != 'cpp')
+
+jni_only_test = lambda: skip_conditional(lambda self: self.app_type != 'jni')
+
+
+def wimpy(func):
+    '''
+    Flag ``func`` as a 'wimpy' test, i.e. one specifically known to run quickly.
+    The function itself is handed back untouched apart from a new ``wimpy``
+    attribute, which is set to True.
+    '''
+    setattr(func, 'wimpy', True)
+
+    return func
+
+
+class ordered_test(object):
+    '''Decorator that stores ``order`` on a function as ``test_order``.'''
+    def __init__(self, order):
+        self._order = order
+
+    def __call__(self, func):
+        setattr(func, 'test_order', self._order)
+        return func
+
+
+class deprecated(object):
+    """
+    method or function decorator used to warn of pending feature removal.
+    The warning is issued once, on the first call of the decorated function;
+    its category is ``exception`` (UserWarning unless specified).
+    ``alternative_feature`` and ``removal_date`` only feed the message text.
+
+    >>> @deprecated()
+    ... def myfunc():
+    ...     return 'hello'
+    ...
+    >>> myfunc()
+    UserWarning: `__main__.myfunc()` is deprecated and will be removed soon.
+    'hello'
+    >>> class MyClass(object):
+    ...     @deprecated(alternative_feature='print')
+    ...     def myprint(self, *args, **kwargs):
+    ...         print(*args, **kwargs)
+    ...
+    >>> obj = MyClass()
+    >>> obj.myprint("hello")
+    UserWarning: `__main__.myprint()` is deprecated and will be removed soon. use 'print' instead
+    hello
+    """
+
+    def __init__(
+            self,
+            alternative_feature=None,
+            removal_date='soon',
+            exception=UserWarning
+        ):
+        self.alternative_feature_message = (
+            alternative_feature and 'use %r instead' % alternative_feature or ''
+        )
+        self.exception = exception
+        self.removal_date = removal_date
+
+    def __call__(self, func):
+        class_name = ''
+        if getattr(func, 'im_class', None):
+            class_name = '%s.' % func.im_class.__name__
+        # ``__name__`` exists on Python 2 and 3 alike, unlike ``func_name``;
+        # a Python 2 method is unwrapped through ``im_func`` first.
+        func_name = getattr(func, 'im_func', func).__name__
+
+        module_name = getattr(func, '__module__')
+        warning = "`%s.%s%s()` is deprecated and will be removed %s. %s" % (
+            module_name,
+            class_name,
+            func_name,
+            self.removal_date,
+            self.alternative_feature_message
+        )
+
+        @functools.wraps(func)
+        def inner(*args, **kwargs):
+            if not getattr(func, 'deprecation_warned', False):
+                warnings.warn(warning, self.exception, 2)
+                func.deprecation_warned = True
+            return func(*args, **kwargs)
+
+        return inner

diff --git a/tests/lldb/tests/harness/exception.py b/tests/lldb/tests/harness/exception.py
new file mode 100644
index 0000000..2cfc99f
--- /dev/null
+++ b/tests/lldb/tests/harness/exception.py

@@ -0,0 +1,41 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains TestSuiteException.'''
+
+from __future__ import absolute_import
+
+class TestSuiteException(Exception):
+    '''Signals an internal error inside the test suite.
+
+    Base class of FailFastException and TestIgnoredException. It adds
+    nothing to Exception: the message given at construction is the only
+    payload.
+    '''
+
+
+class DisconnectedException(Exception):
+    '''Signals that lldb-server disconnected unexpectedly.
+
+    Derives directly from Exception rather than TestSuiteException, and
+    carries nothing beyond its message.
+    '''
+
+
+class FailFastException(TestSuiteException):
+    '''Quick Bailout'''
+
+
+class TestIgnoredException(TestSuiteException):
+    '''Signals that a testcase was ignored.'''

diff --git a/tests/lldb/tests/harness/test_base.py b/tests/lldb/tests/harness/test_base.py
new file mode 100644
index 0000000..9902989
--- /dev/null
+++ b/tests/lldb/tests/harness/test_base.py

@@ -0,0 +1,332 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains TestBase, the base class of all tests.'''
+
+from __future__ import absolute_import
+
+import logging
+import os
+import re
+import tempfile
+import inspect
+import traceback
+
+from .exception import DisconnectedException, TestSuiteException
+
+from . import util_log
+
+
+class TestBase(object):
+    '''Base class for all tests. Provides some common functionality.'''
+
+    bundle_target = {}
+
+    class TestFail(Exception):
+        '''Signals that a line in a test failed.
+
+        It is raised when an lldb command fails or its output does not
+        contain the expected string, and when an assertion helper of this
+        class does not hold.
+        '''
+
+    def __init__(self, device_port, device, timer, app_type, wimpy=False, **kwargs):
+        # The full argument list is spelled out for documentation purposes
+        # only; test_base_remote overrides this method.
+        # pylint: disable=unused-argument
+        self.wimpy = wimpy
+        self.app_type = app_type # kind of bundle that is being executed
+        self._timer = timer # used to check whether the test froze
+        self._ci = None # lldb command interpreter instance for this test
+        self._lldb = None # handle to the lldb module
+
+    def setup(self, android):
+        '''Prepare the environment before the test APK is launched.
+
+        The base implementation does nothing. Subclasses override it to run
+        commands ahead of the launch, e.g. to set Android properties or
+        environment variables. The teardown method is its counterpart.
+
+        Args:
+            android: Handler to the android device, see the UtilAndroid class.
+        '''
+        return None
+
+    def teardown(self, android):
+        '''Restore the environment once the test has finished.
+
+        The base implementation does nothing. Subclasses override it to run
+        commands after the test; it is run regardless of the test's outcome.
+
+        Args:
+            android: Handler to the android device, see the UtilAndroid class.
+        '''
+        return None
+
+    def run(self, dbg, remote_pid, lldb):
+        '''Execute the actual test suite.
+
+        Runs every callable 'test_*' member in 'test_order' order (members
+        without one go last); in wimpy mode only 'wimpy' members are run.
+
+        Args:
+            dbg: The instance of the SBDebugger that is used to test commands.
+            remote_pid: The integer that is the process id of the binary that
+                        the debugger is attached to.
+            lldb: A handle to the lldb module.
+
+        Returns:
+            A list of (test, failure) tuples.
+        '''
+        log = util_log.get_logger()
+
+        def predicate(obj):
+            '''check whether we're interested in the function'''
+            if not callable(obj):
+                return False
+            if self.wimpy and not getattr(obj, 'wimpy', False):
+                log.debug("skipping non-wimpy test in wimpy mode:%r", obj)
+                return False
+            return True
+        test_methods = [
+            method for name, method in inspect.getmembers(self, predicate)
+            if name.startswith('test_')
+        ]
+        log.debug("Found the following tests %r", test_methods)
+        test_errors = []
+        for test in sorted(
+            test_methods,
+            key=lambda item: getattr(item, 'test_order', float('Inf'))
+        ):
+            try:
+                log.info("running test %r", test.__name__)
+                test()
+            except (self.TestFail, TestSuiteException) as e:
+                test_errors.append((test, e))
+        return test_errors
+
+    def post_run(self):
+        '''Hook for cleaning up after test execution; a no-op by default.'''
+        return None
+
+    def assert_true(self, cond):
+        '''Raise TestFail unless the given condition holds.
+
+        Args:
+            cond: The condition to check, judged by its truthiness.
+        Raises:
+            TestFail: The condition was false.
+        '''
+        if cond:
+            return None
+        raise self.TestFail()
+
+    def assert_lang_renderscript(self):
+        '''Check that LLDB is stopped in a RenderScript frame
+
+        The language of the compile unit of the selected frame is read
+        through the LLDB API and compared with RenderScript.
+
+        Raises:
+            TestFail: Detected language not RenderScript.
+        '''
+        assert self._lldb
+        assert self._ci
+
+        thread = self._ci.GetProcess().GetSelectedThread()
+        compile_unit = thread.GetSelectedFrame().GetCompileUnit()
+        detected = compile_unit.GetLanguage()
+        expected = self._lldb.eLanguageTypeExtRenderScript
+        if detected != expected:
+            message = 'Frame language not RenderScript, instead {0}'
+            raise self.TestFail(message.format(detected))
+
+    def do_command(self, cmd):
+        '''Run an lldb command and return the output.
+
+        Args:
+            cmd: The string representing the lldb command to run.
+
+        Returns:
+            The output of the command as reported by lldb, or the empty
+            string if lldb reported none.
+
+        Raises:
+            TestFail: The lldb command failed.
+        '''
+        assert self._lldb
+        assert self._ci
+
+        log = util_log.get_logger()
+        result = self._lldb.SBCommandReturnObject()
+        log.info('[Command] {0}'.format(cmd))
+
+        # Restart the timer right before the command is issued, so that a
+        # command that freezes the test gets noticed.
+        if self._timer:
+            self._timer.reset()
+        self._ci.HandleCommand(cmd, result)
+
+        if not result.Succeeded():
+            reason = result.GetError() or result.GetOutput() or '<N/a>'
+            raise self.TestFail('The command "{0}" failed with the error: {1}'
+                                .format(cmd, reason))
+
+        text = result.GetOutput() or ''
+        log.debug('[Output] {0}'.format(text.rstrip()))
+        return text
+
+    def try_command(self, cmd, expected=None, expected_regex=None):
+        '''Run an lldb command and match the expected response.
+
+        Args:
+            cmd: The string representing the lldb command to run.
+            expected: A list of strings that should be present in lldb's
+                      output.
+            expected_regex: A list of regular expressions that should
+                            match lldb's output.
+
+        Raises:
+            TestFail: The command failed, or one of the expected strings or
+                      patterns was not found in the lldb output.
+            DisconnectedException: The output reports a lost connection.
+
+        Returns:
+            str: raw lldb command output.
+        '''
+        assert self._lldb
+        assert self._ci
+        log = util_log.get_logger()
+        output = ''
+        try:
+            output = self.do_command(cmd)
+            if 'lost connection' in output:
+                raise DisconnectedException('Lost connection to lldb-server.')
+
+            # check the expected strings
+            if expected:
+                self._match_literals(output, expected)
+
+            # check the regexp patterns
+            if expected_regex:
+                self._match_regexp_patterns(output, expected_regex)
+
+        except self.TestFail as exception:
+            # if the command failed, ensure the output retrieved from the
+            # command is printed even when not in verbose mode
+            if log.getEffectiveLevel() > logging.DEBUG:
+                log.error('[Output] {0}'.format(output.rstrip() if output
+                                                else '<empty>'))
+
+            # print the back trace, it helps to identify the error in the test
+            backtrace = ['[Back trace]']
+            for (filename, line, function, text) in \
+                    traceback.extract_stack()[:-1]:
+                backtrace.append('  [{0} line: {2} fn: {1}] {3}'.format(
+                            filename, function, line, text
+                    )
+                )
+            log.error('\n'.join(backtrace))
+            # str() instead of `.message`, which only exists on Python 2
+            log.error('[TEST ERROR] {0}'.format(str(exception)))
+            raise  # pass through
+
+        return output
+
+    def _match_literals(self, text, literals):
+        '''Ensure every literal occurs somewhere in the given text.
+
+        The check stops at the first literal that is missing.
+
+        Args:
+            text: String, the text to search.
+            literals: an array of string literals to look for in text.
+
+        Raises:
+            TestFail: if one of the literals is not contained in text.
+        '''
+        for literal in literals:
+            if literal in text:
+                continue
+            raise self.TestFail(
+                'Cannot find "{0}" in the output'.format(literal))
+
+    def _match_regexp_patterns(self, text, patterns):
+        '''Ensure every regular expression matches somewhere in the text.
+
+        Each successful match is reported in the debug log; the check stops
+        at the first pattern that cannot be matched.
+
+        Args:
+            text: String, the text to search.
+            patterns: an array of strings, each of them a regular expression
+                      to search for in text.
+
+        Raises:
+            TestFail: if one of the patterns does not match the text.
+        '''
+        logger = util_log.get_logger()
+
+        for pattern in patterns:
+            found = re.search(pattern, text)
+            if found is None:
+                raise self.TestFail('Cannot match the regexp "{0}" in '
+                                    'the output'.format(pattern))
+
+            # log the text that was matched by the whole pattern
+            logger.debug('Found match to regex {0}: {1}'.format(
+                pattern, found.group()))
+
+    @staticmethod
+    def get_tmp_file_path():
+        '''Get the path of a temporary file that no longer exists on disk.
+
+        Returns:
+            A string: the path mkstemp() created, with the file removed.
+        '''
+        handle, path = tempfile.mkstemp()
+        os.close(handle)
+        os.unlink(path)
+        return path
+
+
+class TestBaseNoTargetProcess(TestBase):
+    '''lldb target that doesn't require a binary to be running.'''
+
+    def get_bundle_target(self):
+        '''Get bundle executable to run; always None for this class.'''
+        return None
+
+    @property
+    def bundle_target(self):
+        '''Property wrapper around get_bundle_target().'''
+        return self.get_bundle_target()
+
+    def run(self, dbg, remote_pid, lldb):
+        '''Execute the test case.
+
+        Args:
+            dbg: The instance of the SBDebugger that is used to test commands.
+            remote_pid: Passed through unchanged to TestBase.run().
+            lldb: A handle to the lldb module.
+
+        Returns:
+            The value returned by TestBase.run().
+        '''
+        self._lldb = lldb
+        self._dbg = dbg
+        self._ci = dbg.GetCommandInterpreter()
+        assert self._ci.IsValid()
+        # super() already binds self; mirror the call in TestBaseRemote.run()
+        return super(TestBaseNoTargetProcess, self).run(dbg, remote_pid, lldb)

diff --git a/tests/lldb/tests/harness/test_base_remote.py b/tests/lldb/tests/harness/test_base_remote.py
new file mode 100644
index 0000000..c430edf
--- /dev/null
+++ b/tests/lldb/tests/harness/test_base_remote.py

@@ -0,0 +1,172 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the base class TestBaseRemote'''
+
+from __future__ import absolute_import
+
+import os
+import re
+
+from .test_base import TestBase
+from . import util_log
+
+
+class TestBaseRemote(TestBase):
+    '''Base class for all tests that connect to a remote device.
+
+    Provides common functionality to set up the connection and tear it down.
+    '''
+
+    def __init__(self, device_port, device, timer, *args, **kwargs):
+        '''Store the connection details; no platform is selected yet.'''
+        super(TestBaseRemote, self).__init__(
+            device_port, device, timer, *args, **kwargs)
+        self._device = device            # id of the device adb talks to
+        self._device_port = device_port  # port used by lldb-server on device
+        self._platform = None
+
+    def set_src_map(self, file_name, new_src_path):
+        '''Call lldb to set the source mapping of a given file.
+
+        Maps the directory lldb reports for the file to a given path. This
+        can be used to make the test suite independent of where an APK was
+        compiled.
+
+        Args:
+            file_name: String, name of the file whose mapping is changed.
+            new_src_path: String, the new absolute path to the source file.
+
+        Raises:
+            TestFail: The source path could not be read from the line table.
+        '''
+        line_table = self.do_command('target modules dump line-table '
+                                     + file_name)
+        lines = line_table.split('\n')
+
+        # Expecting output like:
+        # (lldb) target modules dump line-table scalars.rs
+        # Line table for /home/jenkins/workspace/grd-aosp-parameterised-build/
+        # merge_151216/frameworks/rs/tests/lldb/java/BranchingFunCalls/src/rs/
+        # frameworks/rs/tests/lldb/java/BranchingFunCalls/src/rs/scalars.rs in
+        # `librs.scalars.so
+        # 0xb30f2374: /home/jenkins/workspace/grd-aosp-parameterised-build/
+        # merge_151216/frameworks/rs/tests/lldb/java/BranchingFunCalls/src/rs/
+        # scalars.rs:46
+        # The first line holds a mangled path: parse the first entry instead
+        tokens = re.findall(r"[^ :]+", lines[1]) if len(lines) > 1 else []
+        if 'Line table for' not in lines[0] or len(tokens) < 2:
+            raise self.TestFail('Could not determine source path of '
+                                + file_name)
+        old_dir = os.path.dirname(tokens[1])
+        self.try_command('settings set target.source-map %s %s'
+                         % (old_dir, new_src_path), [''])
+
+    def post_run(self):
+        '''Disconnect from the remote platform, if one was selected.'''
+        if self._platform:
+            self._platform.DisconnectRemote()
+
+    def _connect_to_platform(self, lldb_module, dbg, remote_pid):
+        '''Connect to an lldb platform that has been started elsewhere.
+
+        Args:
+            lldb_module: A handle to the lldb module.
+            dbg: The instance of the SBDebugger that should connect to the
+                 server.
+            remote_pid: The process id of the binary to attach to.
+
+        Returns:
+            True if the debugger successfully attached to the server and
+            process, False if any step along the way failed.
+        '''
+        # pylint: disable=too-many-return-statements
+        remote_pid = str(remote_pid)
+
+        log = util_log.get_logger()
+
+        err1 = dbg.SetCurrentPlatform('remote-android')
+        if err1.Fail():
+            log.fatal(err1.GetCString())
+            return False
+
+        self._platform = dbg.GetSelectedPlatform()
+        if not self._platform:
+            return False
+
+        connect_string = \
+            'adb://{0}:{1}'.format(self._device, self._device_port)
+        opts = lldb_module.SBPlatformConnectOptions(connect_string)
+
+        for _ in range(2):
+            err2 = self._platform.ConnectRemote(opts)
+            if err2.Fail():
+                log.error(err2.GetCString())
+
+                if 'Connection refused' in err2.GetCString():
+                    log.warning('Connection to lldb server was refused. '
+                                'Trying again.')
+                else:
+                    # Unknown error. Don't try again.
+                    return False
+            else:
+                # Success, stop retrying
+                break
+        else:
+            # for-else: reached only when no attempt ended in "break"
+            log.fatal('Not trying again, maximum retries exceeded.')
+            return False
+
+        target = dbg.CreateTarget(None)
+        if not target:
+            return False
+
+        dbg.SetSelectedTarget(target)
+        listener = lldb_module.SBListener()
+        err3 = lldb_module.SBError()
+        process = target.AttachToProcessWithID(listener, int(remote_pid), err3)
+        if err3.Fail() or not process:
+            log.fatal(err3.GetCString())
+            return False
+
+        return True
+
+    def run(self, dbg, remote_pid, lldb):
+        '''Connect to the remote platform, then execute the actual testsuite.
+
+        Args:
+            dbg: The instance of the SBDebugger that is used to test commands.
+            remote_pid: The integer that is the process id of the binary that
+                        the debugger is to be attached to.
+            lldb: A handle to the lldb module.
+
+        Returns: list of (test, failure) tuples.
+        '''
+        assert dbg
+        assert remote_pid
+        assert lldb
+
+        self._lldb = lldb
+        connected = self._connect_to_platform(lldb, dbg, remote_pid)
+        self.assert_true(connected)
+
+        interpreter = dbg.GetCommandInterpreter()
+        self._ci = interpreter
+        assert interpreter
+        self.assert_true(interpreter.IsValid())
+        self.assert_true(interpreter.HasCommands())
+
+        return super(TestBaseRemote, self).run(dbg, remote_pid, lldb)
+

diff --git a/tests/lldb/tests/harness/util_android.py b/tests/lldb/tests/harness/util_android.py
new file mode 100644
index 0000000..a0cf700
--- /dev/null
+++ b/tests/lldb/tests/harness/util_android.py

@@ -0,0 +1,736 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the class UtilAndroid, providing utility methods to
+interface with Android ADB.'''
+
+from __future__ import absolute_import
+
+import logging
+import re
+import subprocess
+import time
+import collections
+import multiprocessing
+try:
+    # Python 3
+    import queue
+except ImportError:
+    import Queue as queue
+
+from .exception import TestSuiteException
+from . import util_log
+
+
+class UtilAndroid(object):
+    '''Provides some utility methods that interface with Android using adb.'''
+    # pylint: disable=too-many-public-methods
+
+    def __init__(self, adb_path, lldb_server_path_device, device):
+        '''Remember the tool paths and the device, and fetch the logger.'''
+        # location of the adb binary on the local machine
+        self._path_adb = adb_path
+        # location of the lldb server binary on the device
+        self._path_lldbserver = lldb_server_path_device
+        self.device = device
+        self._log = util_log.get_logger()
+        self._prop_stacks = collections.defaultdict(list)
+
+    @staticmethod
+    def _validate_string(string):
+        '''Assert that the argument is a non-empty str.
+
+        Args:
+            string: The value to be checked.
+        '''
+        assert isinstance(string, str)
+        assert string != ''
+
+    def adb(self, args, async=False, device=True, timeout=None):
+        '''Run an adb command (async optional).
+
+        Args:
+            args: The command (including arguments) to run in adb.
+            async: Boolean to specify whether adb should run the command
+                   asynchronously. NOTE(review): `async` is a reserved word
+                   from Python 3.7 on, this signature only parses before it.
+            device: boolean to specify whether the serial id of the android
+                    device should be inserted in the adb command.
+            timeout: it specifies the number of seconds to wait for
+                     a synchronous invocation before aborting. If unspecified or
+                     None it waits indefinitely for the command to complete.
+
+        Raises:
+            ValueError: when async=True is combined with a timeout. A timeout
+                        <= 0 is not checked in this method; NOTE(review): it
+                        is presumably rejected by _execute_command_remote,
+                        confirm there.
+
+        Returns:
+            If adb was synchronously run and the command completed by the
+            specified timeout, a string which is the output (standard out and
+            error) from adb. Otherwise it returns None.
+        '''
+        # Form the command
+        if device:
+            cmd = '{0} -s {1} {2}'.format(self._path_adb, self.device, args)
+        else:
+            cmd = '{0} {1}'.format(self._path_adb, args)
+
+        self._log.debug('Execute ADB: %s', cmd)
+
+        if timeout is None:
+            # no timeout requested
+            return_code, output = UtilAndroid._execute_command_local(cmd, async)
+
+        else:
+            # invocation bounded by the timeout, synchronous only
+            if async:
+                raise ValueError('Invalid combination: asynchronous invocation '
+                                 'with timeout specified')
+
+            return_code, output = UtilAndroid._execute_command_remote(cmd,
+                                                                      timeout)
+
+            if return_code is None:
+                self._log.warning('[ADB] The command timed out: %s', cmd)
+
+        # log the output message
+        if output is not None:
+            self._adb_log_output(cmd, output, return_code)
+
+        return output
+
+    def adb_retry(self, args, max_num_attempts, timeout):
+        '''Run an adb command with a timeout, retrying when it yields nothing.
+
+        The command is executed synchronously through adb(), waiting up to
+        'timeout' seconds for it to complete. An attempt that produces a
+        non-empty output ends the retries; otherwise the same command is
+        issued again, up to 'max_num_attempts' times in total. Note that an
+        attempt returning an empty string is retried just like one that
+        returned None.
+
+        The purpose of this function is to handle the cases when, for some
+        reason, a command sent to 'adb' freezes, blocking the whole test suite
+        indefinitely.
+
+        Args:
+            args: The command (including arguments) to run in adb.
+            max_num_attempts: the max number of attempts to repeat the command
+                              in case of timeout.
+            timeout: it specifies the number of seconds to wait for the adb
+                     command to complete.
+
+        Raises:
+            ValueError: when the parameter timeout is invalid (None or <= 0).
+
+        Returns:
+            The output of the last attempt: a non-empty string (standard out
+            and error) from adb on success, otherwise whatever the final
+            attempt returned (None if no attempt was made).
+        '''
+        if timeout is None or timeout <= 0:
+            raise ValueError('Invalid value for timeout')
+
+        result = None
+        for number in range(1, max_num_attempts + 1):
+            self._log.debug('[ADB] Attempt #%d: %s', number, args)
+            result = self.adb(args, False, True, timeout)
+            if result:
+                return result
+
+        return result
+
+    def _adb_log_output(self, cmd, output, return_code):
+        '''Save in the log the command & output from `adb`.
+
+        Internal function, helper to record in the log the issued adb command
+        together with its output and return code.
+
+        Params:
+            cmd: string, the command issued to `adb`.
+            output: string, the output retrieved from `adb`.
+            return_code: int, the return code from `adb`; any value other
+                         than 0 (None included) is logged as a warning.
+        '''
+
+        message = output.strip()
+
+        # on failure make sure the command itself is in the log too: it is
+        # otherwise only recorded at debug level, i.e. in verbose mode
+        is_warning = return_code != 0
+        threshold = self._log.getEffectiveLevel()
+        if is_warning and threshold > logging.DEBUG:
+            self._log.warning("[ADB] Command executed: {0}".format(cmd))
+
+        level = logging.WARNING if is_warning else logging.DEBUG
+        if not message:
+            message = '<empty>'
+        elif '\n' in message:
+            # a message composed by multiple lines is printed after the
+            # log preamble
+            message = '\n' + message
+
+        self._log.log(level, 'RC: {0}, Output: {1}'.format(return_code,
+                                                           message))
+
+    def check_adb_alive(self):
+        '''Ping the device and raise an exception in case of timeout.
+
+        It sends a ping message through 'adb shell'. The emulator/device should
+        echo the same message back within one minute. A reply that lacks the
+        message is retried after 5 seconds, up to 10 attempts in total.
+
+        Purpose of this method is to check whether 'adb' became frozen or
+        stuck.
+
+        Raises:
+            TestSuiteException: in case the device/emulator does not reply
+                                within one minute or the `ping' message is not
+                                echoed back in any of the attempts.
+        '''
+        token = 'PING'
+        log = util_log.get_logger()
+        cmd = "echo {0}".format(token)
+
+        tries = 10
+        # count upwards so that the first attempt is logged as try #1
+        for try_number in range(1, tries + 1):
+            log.debug('Sending a ping through "adb shell" (try #%s)...',
+                      try_number)
+            # each ping gets 60 seconds to complete
+            output = self.shell(cmd, False, 60)
+
+            if output is None:
+                raise TestSuiteException(
+                    'Timeout when pinging the device/emulator through '
+                    '"adb shell".  Is "adb" stuck or dead?')
+            elif token not in output:
+                log.debug('Ping failed. Cannot match the token "%s" in "adb '
+                          'shell %s"', token, cmd)
+            else:
+                log.debug('Pong message received')
+                return
+
+            time.sleep(5)
+
+        raise TestSuiteException('Cannot ping the device/emulator through '
+                                 '"adb shell". Tried %s times. Is "adb" stuck '
+                                 'or dead?' % tries)
+
+    def shell(self, cmd, async=False, timeout=None):
+        '''Run a command via the adb shell.
+
+        Args:
+            cmd: The command (including arguments) to run in the adb shell.
+            async: Boolean to specify whether adb should run the command
+                   asynchronously.
+            timeout: the number of seconds to wait for a synchronous
+                     invocation before aborting; None waits indefinitely.
+
+        Returns:
+            The result of adb() for the wrapped command.
+        '''
+        # the command is placed between double quotes as is, unescaped
+        adb_args = 'shell "{0}"'.format(cmd)
+        return self.adb(adb_args, async, True, timeout)
+
+    def find_app_pid(self, process_name):
+        '''Find the process ID of a process with a given name.
+
+        If more than one instance of the process is running return the first
+        pid found; text between asterisks in the pidof output is ignored.
+
+        Args:
+            process_name: A string representing the name of the package or
+                          binary for which the id should be found. I.e. the
+                          string or part of the string that shows up in the "ps"
+                          command.
+
+        Returns:
+            An integer, the id of the process, or None if it was not found.
+        '''
+        self._validate_string(process_name)
+        pid_output = self.shell('pidof ' + process_name)
+        pid_output = re.sub(r'\*.+\*', '', pid_output)
+        pids = pid_output.split()
+
+        if not pids:
+            self._log.warning(
+                'Unable to find pid of: {0}'.format(process_name))
+            return None
+
+        if len(pids) > 1:
+            self._log.warning('Found multiple instances of {0} running: {1}'
+                              .format(process_name, pids))
+
+        try:
+            pid = int(pids[0])
+            self._log.info('App pid found: {0}'.format(pids[0]))
+            return pid
+        except ValueError:
+            self._log.warning('Unexpected pidof output: {0}'.format(pids[0]))
+            return None
+
+    def adb_root(self):
+        '''Issue "adb root" to set adb to be in root mode.'''
+        self.adb('root')
+
+    def _adb_remount(self):
+        '''Issue "adb remount" to remount the filesystem of the device.'''
+        self.adb('remount')
+
+    def validate_adb(self):
+        '''Check that adb can be run and identifies itself.
+
+        Raises:
+            TestSuiteException: 'adb version' gave no output, or an output
+                                lacking either of 'Android' and 'version'.
+        '''
+        banner = self.adb('version', False, False)
+        recognised = banner and 'Android' in banner and 'version' in banner
+        if not recognised:
+            raise TestSuiteException('unable to validate adb')
+        self._log.info('adb found: {0}'.format(banner))
+
+    def is_booted(self):
+        '''Check if the device/emulator has finished booting.
+
+        Returns: True if property sys.boot_completed is '1', else False.
+        '''
+        boot_completed = self._get_prop('sys.boot_completed')
+        return boot_completed.strip() == '1'
+
+    def validate_device(self, check_boot=True, device_substring=''):
+        '''Validate that there is at least one device.
+
+        When no device id was given and exactly one matching device is
+        attached, that device is selected and stored in self.device.
+
+        Args:
+            check_boot: Boolean to specify whether to check whether the device
+                        has finished booting as well as being present.
+            device_substring: String that needs to be part of the name of the
+                              device.
+
+        Raises:
+            TestSuiteException: There was a failure to run adb to list the
+                                devices or there is no device connected or
+                                multiple devices connected without the user
+                                having specified the device to use, or the
+                                specified device is not attached, or it has
+                                not finished booting (with check_boot).
+        '''
+        out = self.adb('devices', False, False)
+        if not out or 'List of devices attached' not in out:
+            raise TestSuiteException('Unable to list devices')
+
+        # names of the attached devices that contain device_substring
+        devices = []
+        for line in out.split('\n')[1:]:
+            if '\tdevice' not in line:
+                continue
+            device = line.split()[0]
+            # match against the name only, not against the state column
+            if device_substring in device:
+                devices.append(device)
+
+        if not devices:
+            raise TestSuiteException('adb is unable to find a connected '
+                                     'device/emulator to test.')
+
+        if not self.device:
+            if len(devices) > 1:
+                raise TestSuiteException('Multiple devices connected, '
+                                         'specify -d device id.')
+            self.device = devices[0]
+        elif self.device not in devices:
+            raise TestSuiteException('Couldn\'t find the device {0} that '
+                                     'was specified, please check -d '
+                                     'argument'.format(self.device))
+
+        if check_boot and not self.is_booted():
+            raise TestSuiteException(
+                'The device {0} has not yet finished booting.'
+                .format(self.device))
+
+    def device_with_substring_exists(self, device_substring):
+        '''Check whether a device exists whose name contains a given string.
+
+        Args:
+            device_substring: String to look for in the names of the devices.
+
+        Returns: True if adb lists such a device, False otherwise.
+
+        Raises:
+            TestSuiteException: adb failed to list the devices.
+        '''
+        listing = self.adb('devices', False, False)
+        if 'List of devices attached' not in listing:
+            raise TestSuiteException('Unable to list devices')
+
+        # the first line of the listing is skipped
+        for entry in listing.split('\n')[1:]:
+            if '\tdevice' not in entry:
+                continue
+            name = entry.split()[0]
+            if device_substring in name:
+                return True
+
+        return False
+
+    def get_device_id(self):
+        '''Return the ID of the device on which the tests will be run.
+
+        Returns:
+            String representing device ID (the value of self.device).
+        '''
+        return self.device
+
+    def _kill_pid(self, pid):
+        '''Send "kill -9" through the adb shell to the process with this pid.
+
+        Args:
+            pid: The integer that is the process id of the process to be killed.
+        '''
+        self.shell('kill -9 {0}'.format(pid))
+
+    def stop_app(self, package_name):
+        '''Terminate an app by calling am force-stop.
+
+        Args:
+            package_name: Non-empty string, the package name of the app.
+        '''
+        self._validate_string(package_name)
+        command = 'am force-stop {0}'.format(package_name)
+        self.shell(command)
+
+    def kill_process(self, name):
+        '''Kill a process identified by its name (package name in case of apk).
+
+        The pid is looked up with find_app_pid() and sent a "kill" command.
+
+        Args:
+            name: The string representing the name of the binary of the process
+                  that is to be killed.
+
+        Returns:
+            True if the kill command was executed, False if no pid could be
+            found for the name.
+        '''
+        pid = self.find_app_pid(name)
+        if not pid:
+            return False
+        self._kill_pid(pid)
+        return True
+
+    def kill_all_processes(self, name):
+        '''Repeatedly try to call "kill" on a process to ensure it is gone.
+
+        Returns as soon as kill_process() finds nothing left to kill. If the
+        process is still there after 5 attempts reboot the device.
+
+        Args:
+            name: The string representing the name of the binary of the process
+                  that is to be killed.
+
+        Raises:
+            TestSuiteException: If the process could not be killed after 5
+                                attempts and the device then failed to boot
+                                after rebooting.
+        '''
+        # try 5 times to kill this process
+        for _ in range(5):
+            if not self.kill_process(name):
+                return
+        # stalled process must reboot
+        self._reboot_device()
+
+    def kill_servers(self):
+        '''Kill every running instance of gdbserver and of lldb-server.
+
+        Raises:
+            TestSuiteException: If gdbserver or lldb-server could not be killed
+                                after 5 attempts and the device then failed to
+                                boot after rebooting.
+        '''
+        for server in ('gdbserver', 'lldb-server'):
+            self.kill_all_processes(server)
+
+    def launch_elf(self, binary_name):
+        '''Launch a binary (compiled with the NDK) that is located in /data.
+
+        Args:
+            binary_name: The string representing the name of the binary that is
+                         to be launched.
+
+        Returns:
+            False if the binary is not listed in /data, True otherwise.
+        '''
+        # Ensure the binary is actually present on the device.
+        output = self.shell('ls /data/ | grep ' + binary_name)
+        if binary_name not in output:
+            return False
+
+        stdout = self.shell('exec /data/' + binary_name, True)
+        self._log.info(str(stdout))
+
+        return True
+
+    def wait_for_device(self):
+        '''Run "adb wait-for-device", asking ADB to wait for a ready device.'''
+        self.adb('wait-for-device')
+
+    def _reboot_device(self):
+        '''Reboot the remote device, then put adb in root mode and remount.
+
+        Raises:
+            TestSuiteException: If the device failed to boot after rebooting.
+        '''
+        self.adb('reboot')
+        self.wait_for_device()
+        # Allow 20 mins boot time (one poll a second) for slow emulators (MIPS)
+        polls_left = 60 * 20
+        while not self.is_booted():
+            time.sleep(1)
+            polls_left -= 1
+            if polls_left == 0:
+                raise TestSuiteException('Failed to reboot. Terminating.')
+
+        # root mode first, then the remount; wait for the device after each
+        for step in (self.adb_root, self._adb_remount):
+            step()
+            self.wait_for_device()
+
+    def launch_app(self, name, activity):
+        '''Start an activity of a Renderscript application with "am start".
+
+        Args:
+            name: The string representing the name of the app that is to be
+                  launched.
+            activity: The string representing the activity of the app that is to
+                      be started.
+
+        Returns:
+            Boolean, failure if the apk is not installed, success otherwise.
+        '''
+        assert name and activity
+
+        # an empty reply is taken to mean that the apk is not installed
+        installed = self.shell('pm list packages ' + name)
+        if not installed:
+            return False
+
+        # -S: force stop the app before starting, -W: wait for the launch
+        start_cmd = 'am start -S -W {0}/{0}.{1}'.format(name, activity)
+        reply = self.shell(start_cmd)
+        self._log.info(str(reply))
+
+        return True
+
+    def launch_lldb_platform(self, port):
+        '''Start lldb-server in platform mode on the device.
+
+        The server binary is taken from self._path_lldbserver and is also
+        exported as LLDB_DEBUGSERVER_PATH. After issuing the command the
+        method sleeps for five seconds before returning.
+
+        Args:
+            port: The integer that is the port on which lldb should listen.
+        '''
+        server = self._path_lldbserver
+        launch_cmd = (
+            "export LLDB_DEBUGSERVER_PATH='{path}';{path} p --listen *:{port}"
+            .format(path=server, port=port))
+        # second argument: presumably the asynchronous flag - TODO confirm
+        self.shell(launch_cmd, True)
+        time.sleep(5)
+
+    def forward_port(self, local, remote):
+        '''Forward a tcp port of the device onto the local machine via adb.
+
+        Args:
+            local: The integer that is the local port to forward.
+            remote: The integer that is the remote port to which to forward.
+        '''
+        self.adb('forward tcp:{0} tcp:{1}'.format(str(local), str(remote)))
+
+    def remove_port_forwarding(self):
+        '''Remove all of the forward socket connections open in adb.
+
+        Avoids a windows adb error where we can't bind to a listener
+        because too many files are open.
+
+        Note that `--remove-all` drops every forwarding known to adb, not
+        only those created through forward_port.
+        '''
+        self.adb('forward --remove-all')
+
+    def _get_prop(self, name):
+        '''Read an Android system property with `getprop`.
+
+        The output of the shell command is returned as is, without any
+        trimming.
+
+        Args:
+            name: Name of the property of interest [string].
+
+        Returns:
+            Current value of the property [string].
+        '''
+        query = 'getprop {0}'.format(str(name))
+        return self.shell(query)
+
+    def _set_prop(self, name, value):
+        '''Write an Android system property with `setprop`.
+
+        The value is wrapped in single quotes on the command line.
+
+        Args:
+            name: Name of the property of interest [string].
+            value: Desired new value for the property [string or integer].
+        '''
+        assignment = "setprop {0} '{1}'".format(str(name), str(value))
+        self.shell(assignment)
+
+    def push_prop(self, name, new_value):
+        '''Set an Android system property, remembering the value it had.
+
+        The previous value, with surrounding whitespace removed, is pushed
+        onto the per-property stack so that pop_prop can restore it later.
+
+        Args:
+            name: Name of the property of interest [string].
+            new_value: Desired new value for the property [string or integer].
+        '''
+        previous = self._get_prop(name)
+        self._set_prop(name, new_value)
+
+        # recorded only once the new value has been written
+        history = self._prop_stacks[name]
+        history.append(previous.strip())
+
+    def pop_prop(self, name):
+        '''Restore the value of an Android system property previously set by
+        push_prop.
+
+        The most recently saved value is taken off the stack kept for the
+        property and written back to the device.
+
+        Args:
+            name: Name of the property of interest [string].
+
+        Returns:
+            Nothing; the restored value is not handed back to the caller.
+        '''
+        # NOTE(review): pop() fails if nothing was pushed for this property
+        # (IndexError, assuming the stack is a plain list) - confirm callers
+        # always pair this with push_prop.
+        old_value = self._prop_stacks[name].pop()
+        self._set_prop(name, old_value)
+
+    def reset_all_props(self):
+        '''Put every pushed android property back to its pre-push value.
+
+        For each property with saved values the oldest one (the value seen by
+        the first push) is written back and the saved values are dropped.
+        This is equivalent to popping each property the number of times it has
+        been pushed.
+        '''
+        for name in self._prop_stacks:
+            saved = self._prop_stacks[name]
+            if saved == []:
+                # nothing pushed, or already restored
+                continue
+
+            original_value = saved[0]
+            self._set_prop(name, original_value)
+            self._prop_stacks[name] = []
+
+    def make_device_writeable(self):
+        '''Ensure the device is fully writable, in particular the system folder.
+
+        This disables verity and remounts. When `adb disable-verity` advises a
+        reboot, the device is rebooted first, unless it is an emulator
+        (property `ro.boot.qemu` equal to 1). Afterwards the device is
+        remounted and adb is switched to root, waiting for the device after
+        each step.
+        '''
+        output = self.adb('disable-verity')
+
+        # The property is compared after trimming: _get_prop hands back the
+        # raw shell output (push_prop strips it before storing it), so a
+        # trailing newline would otherwise make the emulator check always
+        # fail.
+        qemu_flag = (self._get_prop('ro.boot.qemu') or '').strip()
+        is_emulator = qemu_flag == '1'
+
+        reboot_advised = bool(output) and (
+            'Now reboot your device for settings to take effect' in output)
+
+        # if the remote is an emulator do not even try to reboot
+        # otherwise check whether a reboot is advised
+        if not is_emulator and reboot_advised:
+            self._reboot_device()
+
+        self._adb_remount()
+        self.wait_for_device()
+        self.adb_root()
+        self.wait_for_device()
+
+    @staticmethod
+    def _execute_command_local(command, async=False):
+        '''Execute the given shell command in the same process.
+
+        The command is run through the shell, with stderr merged into stdout.
+
+        Args:
+            command: String, the command to execute
+            async: Boolean to specify whether adb should run the command
+                   asynchronously.
+
+        Returns:
+            if async == False, it returns a tuple with the return code and
+            the output from the executed command. Otherwise the tuple
+            (None, None).
+        '''
+        # NOTE(review): `async` is a reserved keyword from Python 3.7 onwards,
+        # so this signature only parses on older interpreters. Renaming it
+        # needs a check of every caller that passes it by keyword.
+        proc = subprocess.Popen(command,
+                                stdout=subprocess.PIPE,
+                                stderr=subprocess.STDOUT,
+                                shell=True)
+        if async:
+            # the process is neither waited for nor is its output read here
+            return None, None
+
+        # read the whole output from the command
+        with proc.stdout as file_proc:
+            output = ''.join(line for line in file_proc)
+
+        # release the process state
+        # NOTE(review): terminate() is issued once stdout reached EOF, before
+        # the exit status is collected - presumably the command is expected
+        # to have finished by then; confirm before reordering.
+        proc.terminate()
+        return_code = proc.wait()
+
+        return return_code, output
+
+    @staticmethod
+    def _execute_command_remote(command, timeout):
+        '''Run the given shell command in a helper process, with a time limit.
+
+        A dedicated process is spawned to execute the command and to report
+        its result through a queue. The caller blocks on that queue for at
+        most `timeout` seconds; the helper process is terminated afterwards,
+        whether or not it delivered a result.
+
+        Args:
+            command: String, the command to execute.
+            timeout: the number of seconds to wait for the command to complete.
+
+        Returns:
+            a pair with the return code and the output from the command, if it
+            completed by the specified 'timeout' seconds. Otherwise the tuple
+            (None, None).
+        '''
+        results = multiprocessing.Queue()
+        worker = multiprocessing.Process(
+            target=_handle_remote_request,
+            name="Executor of `{0}'".format(command),
+            args=(command, results)
+        )
+
+        # execute the command
+        worker.start()
+
+        # wait for the result
+        try:
+            return_code, output = results.get(True, timeout)
+        except queue.Empty:
+            # timeout hit, the helper has not fulfilled our request by the
+            # given time: report <None, None>
+            return_code, output = None, None
+
+        # terminate the helper process
+        worker.terminate()
+
+        return return_code, output
+
+
+def _handle_remote_request(command, channel):
+    '''Body of the helper process started by _execute_command_remote.
+
+    Runs the command synchronously through
+    UtilAndroid._execute_command_local and puts the resulting
+    (return code, output) pair into the channel. It is not meant to be called
+    from anywhere else.
+
+    Args:
+        command: the command to execute.
+        channel: the channel to communicate with the caller process.
+    '''
+    outcome = UtilAndroid._execute_command_local(command)
+    channel.put(outcome)
+

diff --git a/tests/lldb/tests/harness/util_bundle.py b/tests/lldb/tests/harness/util_bundle.py
new file mode 100644
index 0000000..68954cb
--- /dev/null
+++ b/tests/lldb/tests/harness/util_bundle.py

@@ -0,0 +1,369 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the class UtilBundle, representing a collection of RS
+binaries.'''
+
+from __future__ import absolute_import
+
+import os
+import time
+from . import util_constants
+from . import util_log
+from .exception import TestSuiteException
+
+
+class UtilBundle(object):
+    '''Represents the collection of RS binaries that are debugged.
+
+    Three kinds of binaries are known: Java apks, JNI apks and NDK
+    executables. They are installed, removed and launched through the android
+    utility object passed to the constructor.
+    '''
+
+    # Map of binary name to package name of all Java apps debugged
+    _tests_apk = {
+        'JavaInfiniteLoop': 'com.android.rs.infiniteloop',
+        'JavaDebugWaitAttach': 'com.android.rs.waitattachdebug',
+        'JavaNoDebugWaitAttach': 'com.android.rs.waitattachnodebug',
+        'BranchingFunCalls': 'com.android.rs.branchingfuncalls',
+        'KernelVariables': 'com.android.rs.kernelvariables',
+        'Allocations': 'com.android.rs.allocations',
+        'MultipleRSFiles': 'com.android.rs.multiplersfiles',
+        'SingleSource': 'com.android.rs.singlesource',
+        'ScriptGroup': 'com.android.rs.scriptgroup',
+        'Reduction': 'com.android.rs.lldbreductiontest',
+    }
+
+    # Map of binary name to package name of all JNI apps debugged
+    _tests_jni = {
+        'JNIInfiniteLoop': 'com.android.rs.jniinfiniteloop',
+        'JNIDebugWaitAttach': 'com.android.rs.jnidebugwaitattach',
+        'JNINoDebugWaitAttach': 'com.android.rs.jninodebugwaitattach',
+        'JNIBranchingFunCalls': 'com.android.rs.jnibranchingfuncalls',
+        'JNIKernelVariables': 'com.android.rs.jnikernelvariables',
+        'JNIAllocations': 'com.android.rs.jniallocations',
+        'JNIMultipleRSFiles': 'com.android.rs.jnimultiplersfiles'
+    }
+
+    # Names of the NDK executables; they live directly under /data on the
+    # device
+    _tests_ndk = {'CppInfiniteLoop', 'CppNoDebugWaitAttach',
+                  'CppDebugWaitAttach', 'CppBranchingFunCalls',
+                  'CppKernelVariables', 'CppAllocations', 'CppMultipleRSFiles'}
+
+    _missing_path_msg = (
+        'No product path has been provided. If using `lunch` ensure '
+        'the `ANDROID_PRODUCT_OUT` environment variable has been set correctly. '
+        'Alternatively, include it in the config file or specify it explicitly '
+        'on the command line (`--aosp-product-path`)'
+    )
+
+    def __init__(self, android, aosp_product_path):
+        '''Create the bundle.
+
+        Args:
+            android: The android utility object used to talk to the device.
+            aosp_product_path: String, the product output folder the binaries
+                               are taken from. It may be empty or None, in
+                               which case the push operations raise.
+        '''
+        assert android
+        self._android = android # Link to the android module
+        self._aosp_product_path = aosp_product_path
+        self._log = util_log.get_logger()
+
+    def _product_folder(self):
+        '''Return the product output folder.
+
+        Raises:
+            TestSuiteException: No product path has been configured.
+        '''
+        product_folder = self._aosp_product_path
+        if not product_folder:
+            raise TestSuiteException(self._missing_path_msg)
+        return product_folder
+
+    def is_apk(self, name):
+        '''Checks if a binary of a given name is an apk.
+
+        Checks whether the name of the apk is in the dictionary of apks.
+
+        Args:
+            name: The string that is the name of the binary to check.
+
+        Returns:
+            True if the binary is a Java apk, False if it is an NDK or JNI
+            binary.
+
+        Raises:
+            TestSuiteException: The string does not match any item in the list
+            of APK, NDK or JNI binaries.
+        '''
+        if name in self._tests_apk:
+            return True
+        if name not in self._tests_ndk and name not in self._tests_jni:
+            raise TestSuiteException('test not apk or ndk')
+        return False
+
+    def uninstall_all(self):
+        '''Uninstall/Delete all the testsuite's apks and binaries on the device.
+
+        Raises:
+            TestSuiteException: One or more apks could not be uninstalled.
+        '''
+        self.uninstall_all_apk()
+        self._delete_all_ndk()
+        self._uninstall_all_jni()
+
+    def uninstall_all_apk(self):
+        '''Uninstall all apks used by the test suite from the device.
+
+        Each uninstall is attempted up to three times with a timeout of 180
+        per attempt.
+
+        Raises:
+            TestSuiteException: An apk could not be uninstalled.
+        '''
+        max_num_attempts = 3
+        timeout = 180
+
+        for app, package in self._tests_apk.items():
+            self._log.info('Uninstalling the application: %s', app)
+            output = self._android.adb_retry('uninstall ' + package,
+                                             max_num_attempts, timeout)
+
+            if output is None:
+                raise TestSuiteException('Repeated timeouts when uninstalling '
+                                         'the application: ' + app)
+
+            if 'Success' not in output:
+                outmsg = '\n' + output.rstrip() if output else '<empty>'
+                self._log.error('Cannot match the string "Success" in the '
+                                'output: %s', outmsg)
+                raise TestSuiteException('Unable to uninstall app ' + app)
+
+            self._log.debug('Application uninstalled: %r', app)
+
+    def _uninstall_all_jni(self):
+        '''Uninstall all JNI apks used by the test suite from the device.
+
+        Raises:
+            TestSuiteException: An apk could not be uninstalled.
+        '''
+        for app, package in self._tests_jni.items():
+            output = self._android.adb('uninstall ' + package)
+
+            # a missing output (None) counts as a failure as well, instead
+            # of tripping over the membership test
+            if not output or 'Success' not in output:
+                raise TestSuiteException('unable to uninstall app ' + app)
+
+    def _delete_all_ndk(self):
+        '''Delete all ndk binaries that were pushed to the device.
+
+        A binary that is not on the device only results in a warning.
+        '''
+        for app in self._tests_ndk:
+            output = self._android.shell('rm /data/' + app)
+            if output and 'No such file or directory' in output:
+                self._log.warning('unable to uninstall app ' + app)
+
+    def push_all(self):
+        '''Push all apk and ndk binaries required by the testsuite to the device
+
+        Raises:
+            TestSuiteException: One or more apks could not be installed or
+                                previously running processes thereof could not
+                                be killed.
+        '''
+        self._push_all_java()
+        self._push_all_ndk()
+        self._push_all_jni()
+
+    def _install_apk(self, app, package):
+        '''Push an apk file to the device.
+
+        This involves uninstalling any old installation and installing again.
+
+        Args:
+            app: A string that is the name of the apk.
+            package: A string that is the name of the package of the apk.
+
+        Raises:
+            TestSuiteException: The apk could not be installed, or no product
+                                path has been configured.
+        '''
+        self._log.info('pushing {0}'.format(app))
+
+        self._android.stop_app(package)
+
+        self._android.adb('uninstall ' + package)
+        # Ignore the output of uninstall.
+        # The app may not have been installed in the first place. That's ok.
+
+        flags = ''
+
+        app_folder = os.path.join(self._product_folder(), 'data/app')
+
+        cmd = 'install {0} {1}/{2}/{2}.apk'.format(flags, app_folder, app)
+        output = self._android.adb(cmd, False, True,
+                                   util_constants.PUSH_TIMEOUT)
+        # no output at all (presumably a timeout - TODO confirm) is a failed
+        # installation, too
+        if (not output or 'Success' not in output or
+                "can't find" in output):
+            raise TestSuiteException('unable to install app {}: {}'.format(
+                app, output))
+
+    def _push_all_java(self):
+        '''Push all apk files to the device.
+
+        This involves uninstalling any old installations and installing again.
+
+        Raises:
+            TestSuiteException: An apk could not be installed.
+        '''
+        for app, package in self._tests_apk.items():
+            self._install_apk(app, package)
+
+    def _push_all_ndk(self):
+        '''Push all ndk binaries to the device.
+
+        Raises:
+            TestSuiteException: A binary could not be pushed to the device or
+                                a previous process could not be killed.
+        '''
+        bin_folder = os.path.join(self._product_folder(), 'system/bin')
+
+        for app in self._tests_ndk:
+            self._log.info('pushing {0}'.format(app))
+
+            self._android.kill_all_processes(app)
+
+            cmd = 'push %s/%s /data' % (bin_folder, app)
+            output = self._android.adb(cmd, False, True,
+                                       util_constants.PUSH_TIMEOUT)
+            if (output is None or
+                    'failed to copy' in output or
+                    'No such file or directory' in output):
+                raise TestSuiteException('unable to push binary ' + app)
+
+            # be sure to set the execute bit for NDK binaries
+            self._android.shell('chmod 777 /data/{0}'.format(app))
+
+    def _push_all_jni(self):
+        '''Push all JNI apk files to the device.
+
+        This involves uninstalling any old installations and installing again.
+
+        Raises:
+            TestSuiteException: An apk could not be installed.
+        '''
+        # fail early, before touching the device, if there is nothing to push
+        self._product_folder()
+
+        # Ensure the system/lib directory is writable
+        self._android.make_device_writeable()
+
+        for app, package in self._tests_jni.items():
+            self._install_apk(app, package)
+
+    def delete_ndk_cache(self):
+        '''Deletes NDK cached scripts from the device.
+
+        The NDK caches compiled scripts as shared libraries in
+        the folder specified when calling `rs->init()`.
+
+        For all our tests this is set to '/data/rscache'.
+        '''
+        self._android.shell('rm -r /data/rscache')
+
+    def get_package(self, app_name):
+        '''From a given apk name get the name of its package.
+
+        Args:
+            app_name: The string that is the name of the apk.
+
+        Returns:
+            A string representing the name of the package of the app.
+
+        Raises:
+            TestSuiteException: The app name is not in the list of apks.
+        '''
+        for known_apps in (self._tests_apk, self._tests_jni):
+            if app_name in known_apps:
+                return known_apps[app_name]
+
+        raise TestSuiteException(
+            'unknown app %s. (Do you need to add an '
+            'entry to util_bundle.py :: _tests_*?)' % app_name)
+
+    def launch(self, app_name):
+        '''Launch an apk/ndk app on a remote device.
+
+        Processes left over from a previous run are killed before the launch.
+
+        Args:
+            app_name: The string that is the name of the APK or NDK executable.
+
+        Returns:
+            The Process ID of the launched executable, otherwise None
+
+        Raises:
+            TestSuiteException: Previous processes of this apk could not be
+                                killed.
+        '''
+        if app_name in self._tests_apk:
+            process_name = self._tests_apk[app_name]
+            self._android.kill_all_processes(process_name)
+            success = self._android.launch_app(process_name, 'MainActivity')
+        elif app_name in self._tests_ndk:
+            process_name = app_name
+            self._android.kill_all_processes(process_name)
+            success = self._android.launch_elf(process_name)
+        elif app_name in self._tests_jni:
+            process_name = self._tests_jni[app_name]
+            self._android.kill_process(process_name)
+            success = self._android.launch_app(process_name, 'MainActivity')
+        else:
+            self._log.error('Executable {0} neither Java nor NDK.'
+                            .format(app_name))
+
+            self._log.fatal('Failed to launch test executable {0}'
+                            .format(app_name))
+            return None
+
+        if not success:
+            self._log.log_and_print(app_name +
+                ' is not installed. Try removing the --no-install option?')
+            return None
+
+        return self._android.find_app_pid(process_name)
+
+    def check_apps_installed(self, java_only):
+        ''' Check whether all Java/JNI/NDK apps are installed on the device.
+
+        Args:
+            java_only: Boolean to specify whether only the Java apks should be
+                       checked (in case of --wimpy mode for example).
+
+        Raises:
+            TestSuiteException: Not all apps are installed.
+        '''
+        java_and_jni_apks = self._tests_apk.copy()
+
+        if not java_only:
+            java_and_jni_apks.update(self._tests_jni)
+
+        installed = self._android.shell('pm list packages -f')
+
+        for app, package in java_and_jni_apks.items():
+            if package not in installed:
+                raise TestSuiteException('apk %s is not installed.' % app)
+
+        if not java_only:
+            ls_data = self._android.shell('ls /data')
+            for app in self._tests_ndk:
+                if app not in ls_data:
+                    raise TestSuiteException('app %s is not installed.' % app)

diff --git a/tests/lldb/tests/harness/util_constants.py b/tests/lldb/tests/harness/util_constants.py
new file mode 100644
index 0000000..9c7b18c
--- /dev/null
+++ b/tests/lldb/tests/harness/util_constants.py

@@ -0,0 +1,26 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''This file contains constants shared between the test suite runner and the
+individual test runner.'''
+
+from __future__ import absolute_import
+
+# Return codes; judging by their names they describe the outcome of a test
+# run - TODO confirm against the runner that produces them.
+RC_TEST_OK = 0
+RC_TEST_TIMEOUT = 64
+RC_TEST_FAIL = 65
+RC_TEST_FATAL = 66
+RC_TEST_IGNORED = 67
+# Timeout handed to adb when installing apks and pushing binaries,
+# presumably in seconds (i.e. 5 minutes) - TODO confirm.
+PUSH_TIMEOUT = 60*5
+

diff --git a/tests/lldb/tests/harness/util_functions.py b/tests/lldb/tests/harness/util_functions.py
new file mode 100644
index 0000000..32dca1c
--- /dev/null
+++ b/tests/lldb/tests/harness/util_functions.py

@@ -0,0 +1,52 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''This file contains utility functions used by both the test suite and the
+single test executor.'''
+
+from __future__ import absolute_import
+
+import os
+import importlib
+import sys
+
+
+def load_py_module(path):
+    '''Load a python file from disk.
+
+    The folder containing the file is added to sys.path only for the duration
+    of the import; sys.path is left as it was found, whether or not the import
+    succeeds.
+
+    Args:
+        path: String path to python file.
+
+    Returns:
+        python module if success, None otherwise (the path does not exist or
+        the import raised an ImportError).
+    '''
+    assert isinstance(path, str)
+    if not os.path.exists(path):
+        print('Path does not exist: ' + path)
+        return None
+
+    path = os.path.abspath(path)
+    module_dir, module_file = os.path.split(path)
+    module_name, _ = os.path.splitext(module_file)
+
+    # adjust sys.path, runtime counterpart of PYTHONPATH, to temporarily
+    # include the folder containing the user configuration module
+    sys.path.append(module_dir)
+    try:
+        return importlib.import_module(module_name)
+    except ImportError as err:
+        print(str(err))
+        print("Looking in directory ")
+        print(module_dir)
+        return None
+    finally:
+        # Remove exactly the entry appended above. Searching from the end
+        # keeps an identical entry that was already on sys.path untouched.
+        for index in range(len(sys.path) - 1, -1, -1):
+            if sys.path[index] == module_dir:
+                del sys.path[index]
+                break

diff --git a/tests/lldb/tests/harness/util_lldb.py b/tests/lldb/tests/harness/util_lldb.py
new file mode 100644
index 0000000..2d1adca
--- /dev/null
+++ b/tests/lldb/tests/harness/util_lldb.py

@@ -0,0 +1,87 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the class UtilLLDB, which provides lldb utility
+methods.'''
+
+from __future__ import absolute_import
+
+from . import util_constants
+
+# The lldb python bindings are mandatory for this module: when they cannot be
+# imported, explain how to make them importable and exit with the fatal
+# return code.
+try:
+    import lldb
+except ImportError:
+    print('unable to import lldb')
+    print('please run "lldb -P" and add to $PYTHONPATH')
+    quit(util_constants.RC_TEST_FATAL)
+
+
+class UtilLLDB(object):
+    '''Thin static helpers around the python bindings of lldb.'''
+
+    @staticmethod
+    def start():
+        '''Bring up the lldb debugger framework.'''
+        lldb.SBDebugger_Initialize()
+
+    @staticmethod
+    def stop():
+        '''Shut down the lldb debugger framework.
+
+        Raises:
+            AssertionError: If an assertion fails.
+        '''
+        assert lldb
+        lldb.SBDebugger_Terminate()
+
+    @staticmethod
+    def create_debugger():
+        '''Create a new lldb debugger instance in synchronous mode.
+
+        Returns:
+            The SBDebugger instance that was created.
+
+        Raises:
+            AssertionError: If an assertion fails.
+        '''
+        assert lldb
+        debugger = lldb.SBDebugger_Create()
+        # asynchronous mode is switched off for the new instance
+        debugger.SetAsync(False)
+        return debugger
+
+    @staticmethod
+    def destroy_debugger(dbg):
+        '''Dispose of an lldb debugger instance.
+
+        Args:
+            dbg: Instance of SBDebugger that is to be destroyed.
+
+        Raises:
+            AssertionError: If an assertion fails.
+        '''
+        assert lldb
+        lldb.SBDebugger_Destroy(dbg)
+
+    @staticmethod
+    def get_module():
+        '''Give access to the imported lldb module.
+
+        Returns:
+            The lldb module.
+
+        Raises:
+            AssertionError: If an assertion fails.
+        '''
+        assert lldb
+        return lldb

diff --git a/tests/lldb/tests/harness/util_log.py b/tests/lldb/tests/harness/util_log.py
new file mode 100644
index 0000000..fec0703
--- /dev/null
+++ b/tests/lldb/tests/harness/util_log.py

@@ -0,0 +1,141 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Initialise the Python logging facility for the test suite.
+
+It provides the function to initialise the logging facility and retrieve an
+instance of the logger class. It also contains the definition of the internal
+logger class.
+'''
+from __future__ import print_function
+
+import io
+import sys
+import logging
+
+
+# Set to True by initialise(); get_logger() raises while it is still False
+INITIALISED = False
+# Name of the logger used by the test suite
+NAMESPACE = 'RS_LLDB_TESTSUITE'
+
+def initialise(identifier, level=logging.INFO, print_to_stdout=False,
+               file_path=None, file_mode='a'):
+    '''Initialise the logging facility for the test suite.
+
+    This function should be invoked only once, at the start of the program, and
+    before emitting any log.
+
+    Args:
+        identifier: String, a label that will be part of each record. It is
+                    usually the test case name.
+        level: Integer, all messages below this log level will be discarded.
+               Valid values are those recognised by the python logging module:
+               https://docs.python.org/2/library/logging.html#levels .
+        print_to_stdout: Boolean, whether the logs should be redirected to
+                         sys.stdout (true) or stored into a text file (false).
+        file_path: String, path to the text file in which to store the logs.
+                   This option is only meaningful when print_to_stdout = False.
+        file_mode: String, the mode to open the text file. Valid modes are
+                   those recognised by the standard Python `open' function.
+                   This option is only meaningful when print_to_stdout = False.
+
+    Raises:
+        RuntimeError: If the logging has already been initialised
+        ValueError: If the argument "file_path" has not been provided when
+                    print_to_stdout=False
+    '''
+    # pylint: disable=global-statement
+    global INITIALISED
+    if INITIALISED:
+        raise RuntimeError('Already initialised')
+
+    # Validate the arguments before touching the logging module, so that a
+    # rejected call leaves no half-configured logger behind.
+    if not print_to_stdout and file_path is None:
+        raise ValueError('Missing mandatory argument "file_path"')
+
+    # set the logging class
+    old_logger_class = logging.getLoggerClass()
+    logging.setLoggerClass(RsLogger)
+
+    # initialise the Logger
+    log = logging.getLogger(NAMESPACE)
+    log.setLevel(level) # reject all logs below
+
+    # don't propagate the log records to the logging root
+    log.propagate = False
+
+    # restore the previous class
+    logging.setLoggerClass(old_logger_class)
+
+    # handler
+    if print_to_stdout:
+        handler_default = logging.StreamHandler(sys.stdout)
+    else:
+        handler_default = logging.FileHandler(file_path, file_mode)
+
+    # Do not filter records in the handler because of the level
+    handler_default.setLevel(logging.NOTSET)
+
+    # format the message
+    handler_default.setFormatter(
+        logging.Formatter(
+            '%(asctime)s [{0}] [%(levelname)s] %(message)s'
+                .format(identifier)
+    ))
+
+    log.addHandler(handler_default)
+
+    INITIALISED = True
+
+
+class RsLogger(logging.getLoggerClass()):
+    '''Internal logging class.
+
+    This is an internal class to enhance the logging facility with the methods
+    "log_and_print" and "seek_to_end".
+    '''
+    # pylint: disable=too-many-public-methods
+
+    def log_and_print(self, msg, level=logging.INFO):
+        '''Print "msg" to stdout and emit a log record.
+
+        Args:
+            msg: The message to emit.
+            level: The level to use. By default it is logging.INFO.
+        '''
+        print(msg)
+        self.log(level, msg)
+
+    def seek_to_end(self):
+        '''Reset the cursor position to the end for all handlers that are
+        Text File managers.
+
+        File handlers without an open stream (created with delay=True and not
+        yet written to, or already closed) are skipped.
+        '''
+        for hndlr in self.handlers:
+            if not isinstance(hndlr, logging.FileHandler):
+                continue
+            stream = getattr(hndlr, 'stream', None)
+            if stream is not None:
+                stream.seek(0, io.SEEK_END)
+
+
+def get_logger():
+    '''Give access to the Logger instance of the testsuite.
+
+    Raises:
+        RuntimeError: If the logging facility has not been initialised with
+                      "initialise" beforehand.
+
+    Returns:
+        An instance of logging.Logger to write the logs.
+    '''
+    if INITIALISED:
+        return logging.getLogger(NAMESPACE)
+
+    raise RuntimeError('Logging facility not initialised')

diff --git a/tests/lldb/tests/harness/util_timer.py b/tests/lldb/tests/harness/util_timer.py
new file mode 100644
index 0000000..b83a76f
--- /dev/null
+++ b/tests/lldb/tests/harness/util_timer.py

@@ -0,0 +1,92 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Timer utility'''
+
+from __future__ import absolute_import
+
+import threading
+
+
+class Timer(object):
+    '''A restartable utility to execute a callback after a certain interval.
+
+    The class wraps threading.Timer, which can only be started once, so that
+    the same Timer object can be started, stopped and reset repeatedly.
+    '''
+
+    def __init__(self, interval, callback):
+        '''Initialise the Timer without starting it.
+
+        Args:
+            interval: int or float, interval in seconds to count, before
+                invoking the callback
+            callback: function, it handles the function to call once
+                the timeout expires.
+
+        Throws:
+            TypeError: if "interval" is not a number or "callback" is not
+                callable
+        '''
+
+        # validate input parameters
+        if not isinstance(interval, (int, float)):
+            raise TypeError('Argument "interval" is not a number: '
+                             '{0}'.format(type(interval)))
+        if not callable(callback):
+            raise TypeError('Argument "callback" is not a function: '
+                             '{0}'.format(type(callback)))
+
+        self._timer = None
+        self._callback = callback
+        self._interval = interval
+
+    def _is_running(self):
+        '''Checks whether the timer is executing.
+
+        A timer whose interval has already elapsed (its callback has been
+        run) is not considered running, so it can be started again.
+
+        Returns:
+            boolean, true if the timer is currently running, false otherwise
+        '''
+        # The threading.Timer thread terminates once the callback returned or
+        # the timer was cancelled; only a live thread means "still counting".
+        return self._timer is not None and self._timer.is_alive()
+
+    def start(self):
+        '''Starts the timer.
+
+        Returns:
+            self, the Timer instance
+
+        Throws:
+            RuntimeError: if the timer is already running
+        '''
+        if self._is_running():
+            raise RuntimeError('Timer already running')
+
+        # threading.Timer objects are single use: always create a fresh one.
+        self._timer = threading.Timer(self._interval, self._callback)
+        self._timer.start()
+        return self # so that we can perform Timer(...).start()
+
+    def stop(self):
+        '''Stops the timer if it's executing.
+
+        Calling stop on a timer that never started, was already stopped or
+        has already expired is harmless.
+
+        Returns:
+            self, the Timer instance
+        '''
+
+        if self._timer is not None:
+            # cancel() is a no-op on a timer that has already fired
+            self._timer.cancel()
+            self._timer = None
+        return self
+
+    def reset(self):
+        '''Restart the timer.
+
+        Returns:
+            self, the Timer instance
+        '''
+
+        self.stop()
+        self.start()
+        return self

diff --git a/tests/lldb/tests/harness/util_warnings.py b/tests/lldb/tests/harness/util_warnings.py
new file mode 100644
index 0000000..dd52740
--- /dev/null
+++ b/tests/lldb/tests/harness/util_warnings.py

@@ -0,0 +1,60 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Redirect the Python warnings into the log.'''
+
+from __future__ import absolute_import
+
+import warnings
+
+from . import util_log
+
+_OLD_WARNINGS_HANDLER = None
+
+
+def redirect_warnings():
+    '''Redirect all warnings issued by warnings::warn to the log.
+
+    By default all python warnings are printed into sys.stderr. This method
+    will force to redirect them into the test suite logger. Calling it while
+    a redirection is already active has no effect.
+
+    Throws:
+        RuntimeError: If the logging facility has not been initialised; in
+                      that case the warnings handler is left untouched.
+    '''
+
+    # pylint: disable=global-statement
+    global _OLD_WARNINGS_HANDLER
+
+    # Already redirecting?
+    if _OLD_WARNINGS_HANDLER:
+        return
+
+    # Fetch the logger first: if this raises, no handler has been recorded
+    # and a later call to redirect_warnings can still succeed.
+    log = util_log.get_logger()
+
+    _OLD_WARNINGS_HANDLER = warnings.showwarning
+
+    # pylint: disable=redefined-builtin,unused-argument
+    def _redirect_warnings_to_log(message, category, filename, lineno,
+                                  file=None, line=None):
+        '''Redirect the warnings to the Logger.
+
+        The signature mirrors warnings.showwarning. "file" is accepted but
+        ignored: it must not be forwarded to warnings.formatwarning, which
+        does not take that argument.
+        '''
+        log.warning(
+            warnings.formatwarning(
+                message, category, filename, lineno, line).rstrip())
+
+    warnings.showwarning = _redirect_warnings_to_log
+
+
+def restore_warnings():
+    '''Undo redirect_warnings: put back the saved warnings.showwarning.
+
+    Nothing happens when no handler has been saved.
+    '''
+
+    # pylint: disable=global-statement
+    global _OLD_WARNINGS_HANDLER
+
+    if not _OLD_WARNINGS_HANDLER:
+        return
+
+    warnings.showwarning = _OLD_WARNINGS_HANDLER
+    _OLD_WARNINGS_HANDLER = None
+

diff --git a/tests/lldb/tests/run_test.py b/tests/lldb/tests/run_test.py
new file mode 100644
index 0000000..50a0530
--- /dev/null
+++ b/tests/lldb/tests/run_test.py

@@ -0,0 +1,422 @@
+#!/usr/bin/env python
+
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''This script will run one specific test.'''
+from __future__ import print_function, absolute_import
+
+import os
+import sys
+import atexit
+import inspect
+import logging
+import argparse
+import warnings
+
+import harness
+from harness import util_constants
+from harness import util_log
+from harness import util_warnings
+from harness.util_functions import load_py_module
+from harness.util_lldb import UtilLLDB
+from harness.exception import DisconnectedException
+from harness.exception import TestSuiteException, TestIgnoredException
+from harness.util_timer import Timer
+
+
+class TestState(object):
+    '''Mutable attribute bag: every keyword argument passed to the
+    constructor becomes an attribute of the instance (like a namedtuple
+    whose fields can be reassigned).'''
+    def __init__(self, **kwargs):
+        for field in kwargs:
+            setattr(self, field, kwargs[field])
+
+
+def _test_pre_run(state):
+    '''Setup step, called before a test is executed.
+
+    Logs the test name, deletes the bundle's cached NDK scripts and resolves
+    the remote target required by the test. If there is one, the target is
+    launched through the bundle, the servers are killed and an lldb platform
+    is launched on state.device_port.
+
+    Args:
+        state: Test suite state collection, instance of TestState.
+
+    Returns:
+        True if the test needs no remote process, or if the target was
+        launched and the lldb platform spawned. False if no pid could be
+        obtained for the target.
+
+    Raises:
+        AssertionError: If an assertion fails.
+        TestIgnoredException: The test declares no bundle target for
+                              state.bundle_type.
+        TestSuiteException: Previous processes of this apk required for this
+                            test could not be killed.
+    '''
+    assert state.test
+    assert state.bundle
+
+    logger = util_log.get_logger()
+    logger.info('running: {0}'.format(state.name))
+
+    # Remove any cached NDK scripts between tests
+    state.bundle.delete_ndk_cache()
+
+    # Ask the test case which remote target app it needs. The legacy
+    # accessor is tried first, the `bundle_target` dictionary is the fallback.
+    try:
+        target = state.test.get_bundle_target()
+        warnings.warn("get_bundle_target() is deprecated and will be removed soon"
+                      " - use the `bundle_target` dictionary attribute instead")
+    except AttributeError:
+        try:
+            target = state.test.bundle_target[state.bundle_type]
+        except KeyError:
+            raise TestIgnoredException()
+
+    # test case doesn't require a remote process to debug
+    if target is None:
+        return True
+
+    # find the pid of our remote test process
+    state.pid = state.bundle.launch(target)
+    if not state.pid:
+        logger.error('unable to get pid of target')
+        return False
+
+    state.android.kill_servers()
+    # spawn lldb platform on the target device
+    state.android.launch_lldb_platform(state.device_port)
+    return True
+
+
+def _test_post_run(state):
+    '''Cleanup step, called after a test is executed.
+
+    Resolves the remote target of the test and, if there is one, stops it:
+    an apk is stopped through its package name, anything else is killed as a
+    process.
+
+    Args:
+        state: Test suite state collection, instance of TestState.
+
+    Raises:
+        AssertionError: If an assertion fails.
+        TestIgnoredException: The test declares no bundle target for
+                              state.bundle_type.
+    '''
+    assert state.test
+    assert state.bundle
+
+    # The legacy accessor is tried first, the `bundle_target` dictionary is
+    # the fallback.
+    try:
+        target = state.test.get_bundle_target()
+        warnings.warn("get_bundle_target() is deprecated and will be removed soon"
+                      " - use the `bundle_target` dictionary attribute instead")
+    except AttributeError:
+        try:
+            target = state.test.bundle_target[state.bundle_type]
+        except KeyError:
+            raise TestIgnoredException()
+
+    # nothing was launched, nothing to stop
+    if not target:
+        return
+
+    if state.bundle.is_apk(target):
+        state.android.stop_app(state.bundle.get_package(target))
+    else:
+        state.android.kill_process(target)
+
+
+def _test_run(state):
+    '''Run the test case held by "state" and log every failure it reports.
+
+    Args:
+        state: test suite state collection, instance of TestState.
+
+    Returns:
+        True: if the test case reported no failures.
+        False: if the test case reported at least one failure.
+
+    Raises:
+        AssertionError: If an assertion fails.
+    '''
+    assert state.lldb
+    assert state.lldb_module
+    assert state.test
+
+    failures = state.test.run(state.lldb, state.pid, state.lldb_module)
+
+    if not failures:
+        return True
+
+    logger = util_log.get_logger()
+    for test_name, error in failures:
+        message = 'test %s:%s failed: %r' % (state.name, test_name, error)
+        logger.error(message)
+
+    return False
+
+
+def _initialise_timer(android, interval):
+    '''Create and start the 'timeout' timer that catches stalled execution.
+
+    When the timer expires the android properties are reset, stdout is
+    flushed and the process is terminated with
+    util_constants.RC_TEST_TIMEOUT. The timer is also registered to be
+    stopped at interpreter exit.
+
+    Args:
+        android: current instance of harness.UtilAndroid
+        interval: the interval for the timeout, in seconds
+
+    Returns:
+        The instance of the Timer class that was created.
+    '''
+
+    def _abort_on_timeout():
+        '''Callback fired when a test takes longer than the threshold time
+        to complete.'''
+        # Clean up the android properties
+        android.reset_all_props()
+        sys.stdout.flush()
+        # hard exit to force kill all threads that may block our exit
+        # pylint: disable=protected-access
+        os._exit(util_constants.RC_TEST_TIMEOUT)
+
+    watchdog = Timer(interval, _abort_on_timeout)
+    watchdog.start()
+    atexit.register(Timer.stop, watchdog)
+    return watchdog
+
+
+def _quit_test(num, timer):
+    '''Stop the timeout timer (if any), flush stdout and exit.
+
+    Args:
+        num: An integer specifying the exit status, 0 meaning "successful
+             termination".
+        timer: The current Timer instance, or a false value when no timer
+               was created.
+    '''
+    exit_status = num
+    if timer:
+        # make sure the timeout thread does not outlive the test
+        timer.stop()
+    sys.stdout.flush()
+    sys.exit(exit_status)
+
+
+def _execute_test(state):
+    '''Run the full life cycle of a test suite.
+
+    The test is set up, then the pre-run, run and post-run steps are executed
+    in order. The test's own post_run and teardown hooks are always invoked,
+    whether or not a step failed.
+
+    Args:
+        state: The current TestState object.
+
+    Raises:
+        TestSuiteException: If the pre-run or the run step reports failure.
+    '''
+    logger = util_log.get_logger()
+
+    state.test.setup(state.android)
+    try:
+        mandatory_steps = (
+            (_test_pre_run, 'test_pre_run() failed'),
+            (_test_run, 'test_run() failed'),
+        )
+        for step, failure_message in mandatory_steps:
+            if not step(state):
+                raise TestSuiteException(failure_message)
+        _test_post_run(state)
+        logger.info('Test passed')
+
+    finally:
+        state.test.post_run()
+        state.test.teardown(state.android)
+
+
+def _get_test_case_class(module):
+    '''Find the test case class defined by a test case module.
+
+    The classes of the module are inspected in the order returned by
+    inspect.getmembers; the first one exposing an attribute whose name starts
+    with `test_` is selected.
+
+    Args:
+        module: A loaded test case module.
+
+    Returns:
+        The selected class, or None if no class has a `test_` attribute.
+    '''
+    logger = util_log.get_logger()
+    logger.debug("loading test suites from %r", module)
+    for class_name, candidate in inspect.getmembers(module, inspect.isclass):
+        has_tests = any(
+            member.startswith('test_') for member in dir(candidate))
+        if has_tests:
+            logger.info("Found test class %r", class_name)
+            return candidate
+        logger.debug("class %r has no test_ methods", class_name)
+    return None
+
+
+def get_test_dir(test_name):
+    '''Locate the directory holding the test with the given name.
+
+    Only the direct sub-directories of the directory containing this script
+    are searched.
+
+    Args:
+        test_name: The file name of the test to look for.
+
+    Returns:
+        A string that is the directory containing the test.
+
+    Raises:
+        TestSuiteException: If a test with this name does not exist.
+    '''
+    script_dir = os.path.dirname(os.path.realpath(__file__))
+    for entry in os.listdir(script_dir):
+        candidate = os.path.join(script_dir, entry)
+        if not os.path.isdir(candidate):
+            continue
+        if test_name in os.listdir(candidate):
+            return candidate
+
+    raise TestSuiteException(
+        'unable to find test: {0}'.format(test_name))
+
+
+def main():
+    '''Test runner entry point.
+
+    Parses the twelve positional command line arguments, initialises the
+    logger, the android utility, the timeout timer and lldb, then loads the
+    requested test case module and executes its test class. The test is
+    attempted at most twice: a DisconnectedException triggers one retry.
+
+    Every handled path ends in _quit_test(), which exits the process with one
+    of the util_constants codes:
+        RC_TEST_OK: the test ran to completion.
+        RC_TEST_IGNORED: a TestIgnoredException was raised.
+        RC_TEST_FAIL: a TestSuiteException was raised.
+        RC_TEST_FATAL: an AssertionError or any other Exception was raised.
+    '''
+
+    # re-open stdout with no buffering
+    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
+
+    # These stay None until created, so that the exception handlers and the
+    # final cleanup below know what is available.
+    android = None
+    timer = None
+    log = None
+
+    # parse the command line (positional arguments only)
+    # boolean arguments are passed as text: 'true' or '1' (any case) is True
+    truthy = lambda x: x.lower() in ('true', '1')
+    parser = argparse.ArgumentParser("Run a single RenderScript TestSuite against lldb")
+    for name, formatter in (
+       ('test_name', str),
+       ('log_file_path', str),
+       ('adb_path', str),
+       ('lldb_server_path_device', str),
+       ('aosp_product_path', str),
+       ('device_port', int),
+       ('device', str),
+       ('print_to_stdout', truthy),
+       ('verbose', truthy),
+       ('wimpy', truthy),
+       ('timeout', int),
+       ('bundle_type', str),
+    ):
+        parser.add_argument(name, type=formatter)
+
+    args = parser.parse_args()
+
+    try:
+        # create utility classes
+        harness.util_log.initialise(
+            '%s(%s)' % (args.test_name, args.bundle_type),
+            print_to_stdout=args.print_to_stdout,
+            level=logging.INFO if not args.verbose else logging.DEBUG,
+            file_path=args.log_file_path,
+            file_mode='a'
+        )
+        log = util_log.get_logger()
+        log.debug('Logger initialised')
+
+        android = harness.UtilAndroid(args.adb_path,
+                                      args.lldb_server_path_device,
+                                      args.device)
+
+        # start the timeout counter
+        timer = _initialise_timer(android, args.timeout)
+
+        # startup lldb and register teardown handler
+        atexit.register(UtilLLDB.stop)
+        UtilLLDB.start()
+
+        current_test_dir = get_test_dir(args.test_name)
+
+        # load a test case module
+        test_module = load_py_module(os.path.join(current_test_dir,
+                                                  args.test_name))
+
+
+        # inspect the test module and locate our test case class
+        # NOTE(review): _get_test_case_class may return None; that case is
+        # not handled explicitly here.
+        test_class = _get_test_case_class(test_module)
+
+        # if our test inherits from TestBaseRemote, check we have a valid device
+        if (hasattr(test_module, "TestBaseRemote") and
+            issubclass(test_class, test_module.TestBaseRemote)):
+            android.validate_device()
+
+        # create an instance of our test case
+        test_inst = test_class(
+            args.device_port,
+            args.device,
+            timer,
+            args.bundle_type,
+            wimpy=args.wimpy
+        )
+
+        # instantiate a test target bundle
+        bundle = harness.UtilBundle(android, args.aosp_product_path)
+
+        # execute the test case
+        try:
+            # two attempts: the second one is only made after a
+            # DisconnectedException
+            for _ in range(2):
+                try:
+                    # create an lldb instance
+                    lldb = UtilLLDB.create_debugger()
+
+                    # create state object to encapsulate instances
+
+                    state = TestState(
+                         android=android,
+                         bundle=bundle,
+                         lldb=lldb,
+                         lldb_module=UtilLLDB.get_module(),
+                         test=test_inst,
+                         pid=None,
+                         name=args.test_name,
+                         device_port=args.device_port,
+                         bundle_type=args.bundle_type
+                    )
+
+                    util_warnings.redirect_warnings()
+
+                    _execute_test(state)
+
+                    # tear down the lldb instance
+                    UtilLLDB.destroy_debugger(lldb)
+                    break
+                except DisconnectedException as error:
+                    log.warning(error)
+                    log.warning('Trying again.')
+            else:
+                # reached only when the loop was never left through "break",
+                # i.e. both attempts raised DisconnectedException
+                log.fatal('Not trying again, maximum retries exceeded.')
+                raise TestSuiteException('Lost connection to lldb-server')
+
+        finally:
+            util_warnings.restore_warnings()
+
+        _quit_test(util_constants.RC_TEST_OK, timer)
+
+    except AssertionError:
+        if log:
+            log.critical('Internal test suite error', exc_info=1)
+        print('Internal test suite error', file=sys.stderr)
+        _quit_test(util_constants.RC_TEST_FATAL, timer)
+
+    except TestIgnoredException:
+        if log:
+            log.warn("test ignored")
+        _quit_test(util_constants.RC_TEST_IGNORED, timer)
+
+    except TestSuiteException as error:
+        if log:
+            log.exception(str(error))
+        else:
+            print(error, file=sys.stderr)
+        _quit_test(util_constants.RC_TEST_FAIL, timer)
+
+    # use a global exception handler to be sure that we will
+    # exit safely and correctly
+    except Exception:
+        if log:
+            log.exception('INTERNAL ERROR')
+        else:
+            import traceback
+            print('Exception {0}'.format(traceback.format_exc()),
+                  file=sys.stderr)
+        _quit_test(util_constants.RC_TEST_FATAL, timer)
+
+    # runs on every path out of the try statement, including the SystemExit
+    # raised by sys.exit() inside _quit_test
+    finally:
+        if android:
+            android.reset_all_props()
+        if timer:
+            timer.stop()
+
+
+# script entry point: echo the command line, then run the test
+if __name__ == '__main__':
+    print(*sys.argv)
+    main()

diff --git a/tests/lldb/tests/testcases/reduce_common.py b/tests/lldb/tests/testcases/reduce_common.py
new file mode 100644
index 0000000..462d0b3
--- /dev/null
+++ b/tests/lldb/tests/testcases/reduce_common.py

@@ -0,0 +1,102 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+# Constants for the reduction test cases.
+# REDUCE_ITERATIONS bounds the number of 'process continue' attempts made by
+# ReductionMixin._test_func_role_combinations.
+REDUCE_ITERATIONS = 128  # This is in MainActivity.java
+REDUCE_STARTVAL = 10 # This is in MainActivity.java
+# NOTE(review): both script names currently point at the same file - confirm
+# that this is intended.
+REDUCE_AUTO_COMB_SCRIPT = "reduce_common.rsh"
+REDUCE_SCRIPT = "reduce_common.rsh"
+# NOTE(review): presumably the per-axis counts used by the reduction tests;
+# they are not referenced in this module - verify against the test cases.
+X_TESTS = 100
+Y_TESTS = 2
+Z_TESTS = 2
+
+
+class ReductionMixin(object):
+    def _test_func_role_combinations(self, func_role_combinations):
+        """
+        Check breakpoints that are conditional on a pair of function roles.
+        For every pair in `func_role_combinations` a reduction breakpoint is
+        set on `find_min_user_type` restricted to those two roles; the
+        process is then continued until a stop has been observed in the
+        function of each role. Failing to see both within REDUCE_ITERATIONS
+        continues raises self.TestFail.
+        """
+        for combination in func_role_combinations:
+            self._delete_breakpoints()
+            roles = ','.join(combination)
+            self.try_command(
+                'language renderscript reduction breakpoint set '
+                'find_min_user_type --function-role %s' % (roles),
+                [r'Breakpoint(s) created']
+            )
+            # only the first 4 chars of each role are used to match the
+            # function name suffix
+            pending = [combination[0][:4], combination[1][:4]]
+            funcs_match = 'find_min_user_type_((%s|%s))' % tuple(pending)
+            frame_regex = (
+                r'frame #0: (0x[0-9a-fA-F]+ )?librs.reduce.so`%s' % funcs_match
+            )
+            # keep resuming until we have stopped in both functions
+            for _ in range(REDUCE_ITERATIONS):
+                output = self.try_command(
+                    'process continue',
+                    expected_regex=[
+                        r'resuming',
+                        r'Process \d+ stopped',
+                        frame_regex
+                    ]
+                )
+                for line in output.splitlines():
+                    match = re.search(funcs_match, line)
+                    if not match:
+                        continue
+                    try:
+                        pending.remove(match.group(1))
+                    except ValueError:
+                        # This role was already seen. The outconverter may
+                        # only be called in the final step but the
+                        # accumulator will be called for every input index,
+                        # so keep scanning the remaining lines.
+                        continue
+                    break
+                if not pending:
+                    # Both functions of interest have been hit
+                    break
+            else:
+                raise self.TestFail(
+                    "unable to match function roles for " + repr(combination))
+
+    def _reduction_breakpoint_set_single_type(
+            self, script_soname, script_basename, reduce_name, funcname_types):
+        """
+        For each (function, role) pair in `funcname_types`, set a reduction
+        breakpoint on `reduce_name` restricted to that role, assert that it
+        resolves to the expected symbol and that the debugger stops there
+        when the process is continued.
+        """
+        for func, typename in funcname_types:
+            self._delete_breakpoints()
+            location_regex = r'Breakpoint \d+: where = librs.%s.so`%s'
+            # Autogenerated combiners don't have a filename in the debugger
+            if not func.endswith(".combiner"):
+                location_regex = r'%s (\+ \d+ )?at %s' % (
+                        location_regex, script_basename)
+            set_command = (
+                'language renderscript reduction breakpoint set %s'
+                ' --function-role %s' % (reduce_name, typename)
+            )
+            self.try_command(
+                set_command,
+                expected_regex=[location_regex % (script_soname, func)]
+            )
+            stopped_frame = r'frame #0: (0x[0-9a-fA-F]+ )?librs.%s.so`%s' % (
+                script_soname, func)
+            self.try_command(
+                'process continue',
+                expected_regex=[
+                    r'resuming',
+                    r'Process \d+ stopped',
+                    stopped_frame
+                ]
+            )

diff --git a/tests/lldb/tests/testcases/test_allocation_dump_1.py b/tests/lldb/tests/testcases/test_allocation_dump_1.py
new file mode 100644
index 0000000..53e77fc
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_allocation_dump_1.py

@@ -0,0 +1,348 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestAllocationDump1.'''
+from __future__ import absolute_import
+
+import os
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    ordered_test,
+    wimpy,
+    cpp_only_test,
+)
+
+
+class TestAllocationDump1(TestBaseRemote):
+    '''Tests printing the contents of allocations.'''
+
+    bundle_target = {
+        'java': 'Allocations',
+        'jni': 'JNIAllocations',
+        'cpp': 'CppAllocations'
+    }
+
+    @wimpy
+    @ordered_test(0)
+    def test_setup(self):
+        self.try_command('language renderscript kernel breakpoint all enable',
+                         ['Breakpoints will be set on all kernels'])
+
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+    @wimpy
+    def test_dump_to_file1(self):
+        # Test dumping large allocations to file
+        output_file_1 = self.get_tmp_file_path()
+
+        self.try_command('language renderscript allocation dump 1 -f ' +
+                         output_file_1,
+                         ["Results written to '%s'" % output_file_1])
+
+        # Check the file was created
+        self.assert_true(os.path.isfile(output_file_1))
+        os.remove(output_file_1)
+
+    def test_dump_to_file2(self):
+        output_file_2 = self.get_tmp_file_path()
+
+        self.try_command('language renderscript allocation dump 2 -f ' +
+                         output_file_2,
+                         ["Results written to '%s'" % output_file_2])
+
+        self.assert_true(os.path.isfile(output_file_2))
+        os.remove(output_file_2)
+
+    @wimpy
+    def test_dump_char(self):
+        self.try_command('language renderscript allocation dump 3',
+                         ['(0, 0, 0) = 0',
+                          '(0, 1, 0) = 1',
+                          '(0, 2, 0) = 2',
+                          '(0, 0, 1) = 3',
+                          '(0, 1, 1) = 4',
+                          '(0, 2, 1) = 5',
+                          '(0, 0, 2) = 6',
+                          '(0, 1, 2) = 7',
+                          '(0, 2, 2) = 8',
+                          '(0, 0, 3) = 9',
+                          '(0, 1, 3) = 10',
+                          '(0, 2, 3) = 11',
+                          '(0, 0, 4) = 12',
+                          '(0, 1, 4) = 13',
+                          '(0, 2, 4) = 14',
+                          '(0, 0, 5) = 15',
+                          '(0, 1, 5) = 16',
+                          '(0, 2, 5) = 17',
+                          '(0, 0, 6) = 18',
+                          '(0, 1, 6) = 19',
+                          '(0, 2, 6) = 20',
+                          '(0, 0, 7) = 21',
+                          '(0, 1, 7) = 22',
+                          '(0, 2, 7) = 23'])
+
+    def test_dump_char2(self):
+        self.try_command('language renderscript allocation dump 4',
+                         ['(0, 0, 0) = {0 1}',
+                          '(1, 0, 0) = {2 3}',
+                          '(2, 0, 0) = {4 5}',
+                          '(3, 0, 0) = {6 7}',
+                          '(4, 0, 0) = {8 9}',
+                          '(5, 0, 0) = {10 11}',
+                          '(6, 0, 0) = {12 13}',
+                          '(7, 0, 0) = {14 15}',
+                          '(8, 0, 0) = {16 17}',
+                          '(9, 0, 0) = {18 19}',
+                          '(10, 0, 0) = {20 21}',
+                          '(11, 0, 0) = {22 23}'])
+
+    def test_dump_char3(self):
+        self.try_command('language renderscript allocation dump 5',
+                         ['(0, 0, 0) = {0 1 2}',
+                          '(1, 0, 0) = {4 5 6}',
+                          '(2, 0, 0) = {8 9 10}',
+                          '(3, 0, 0) = {12 13 14}',
+                          '(4, 0, 0) = {16 17 18}',
+                          '(5, 0, 0) = {20 21 22}'])
+
+    def test_dump_char4(self):
+        self.try_command('language renderscript allocation dump 6',
+                         ['(0, 0, 0) = {0 1 2 3}',
+                          '(1, 0, 0) = {4 5 6 7}',
+                          '(2, 0, 0) = {8 9 10 11}',
+                          '(3, 0, 0) = {12 13 14 15}',
+                          '(4, 0, 0) = {16 17 18 19}',
+                          '(5, 0, 0) = {20 21 22 23}'])
+
+    def test_dump_short(self):
+        self.try_command('language renderscript allocation dump 7',
+                         ['(0, 0, 0) = 0',
+                          '(1, 0, 0) = 1',
+                          '(2, 0, 0) = 2',
+                          '(3, 0, 0) = 3',
+                          '(4, 0, 0) = 4',
+                          '(5, 0, 0) = 5',
+                          '(6, 0, 0) = 6',
+                          '(7, 0, 0) = 7',
+                          '(8, 0, 0) = 8',
+                          '(9, 0, 0) = 9',
+                          '(10, 0, 0) = 10',
+                          '(11, 0, 0) = 11',
+                          '(12, 0, 0) = 12',
+                          '(13, 0, 0) = 13',
+                          '(14, 0, 0) = 14',
+                          '(15, 0, 0) = 15',
+                          '(16, 0, 0) = 16',
+                          '(17, 0, 0) = 17',
+                          '(18, 0, 0) = 18',
+                          '(19, 0, 0) = 19',
+                          '(20, 0, 0) = 20',
+                          '(21, 0, 0) = 21',
+                          '(22, 0, 0) = 22',
+                          '(23, 0, 0) = 23'])
+
+    def test_dump_short2(self):
+        self.try_command('language renderscript allocation dump 8',
+                         ['(0, 0, 0) = {0 1}',
+                          '(1, 0, 0) = {2 3}',
+                          '(2, 0, 0) = {4 5}',
+                          '(3, 0, 0) = {6 7}',
+                          '(4, 0, 0) = {8 9}',
+                          '(5, 0, 0) = {10 11}',
+                          '(0, 0, 1) = {12 13}',
+                          '(1, 0, 1) = {14 15}',
+                          '(2, 0, 1) = {16 17}',
+                          '(3, 0, 1) = {18 19}',
+                          '(4, 0, 1) = {20 21}',
+                          '(5, 0, 1) = {22 23}'])
+
+    def test_dump_short3(self):
+        self.try_command('language renderscript allocation dump 9',
+                         ['(0, 0, 0) = {0 1 2}',
+                          '(1, 0, 0) = {4 5 6}',
+                          '(2, 0, 0) = {8 9 10}',
+                          '(3, 0, 0) = {12 13 14}',
+                          '(4, 0, 0) = {16 17 18}',
+                          '(5, 0, 0) = {20 21 22}'])
+
+    def test_dump_short4(self):
+        self.try_command('language renderscript allocation dump 10',
+                         ['(0, 0, 0) = {0 1 2 3}',
+                          '(1, 0, 0) = {4 5 6 7}',
+                          '(2, 0, 0) = {8 9 10 11}',
+                          '(3, 0, 0) = {12 13 14 15}',
+                          '(4, 0, 0) = {16 17 18 19}',
+                          '(5, 0, 0) = {20 21 22 23}'])
+
+    def test_dump_int(self):
+        '''Dump allocation 11 and hand the 24 expected scalar cells
+        (values 0 through 23 along x) to try_command.'''
+        expected_cells = [
+            '(0, 0, 0) = 0',
+            '(1, 0, 0) = 1',
+            '(2, 0, 0) = 2',
+            '(3, 0, 0) = 3',
+            '(4, 0, 0) = 4',
+            '(5, 0, 0) = 5',
+            '(6, 0, 0) = 6',
+            '(7, 0, 0) = 7',
+            '(8, 0, 0) = 8',
+            '(9, 0, 0) = 9',
+            '(10, 0, 0) = 10',
+            '(11, 0, 0) = 11',
+            '(12, 0, 0) = 12',
+            '(13, 0, 0) = 13',
+            '(14, 0, 0) = 14',
+            '(15, 0, 0) = 15',
+            '(16, 0, 0) = 16',
+            '(17, 0, 0) = 17',
+            '(18, 0, 0) = 18',
+            '(19, 0, 0) = 19',
+            '(20, 0, 0) = 20',
+            '(21, 0, 0) = 21',
+            '(22, 0, 0) = 22',
+            '(23, 0, 0) = 23',
+        ]
+        self.try_command('language renderscript allocation dump 11',
+                         expected_cells)
+
+    def test_dump_int2(self):
+        '''Dump allocation 12 and hand the 12 expected 2-component cells to
+        try_command.'''
+        expected_cells = [
+            '(0, 0, 0) = {0 1}',
+            '(1, 0, 0) = {2 3}',
+            '(2, 0, 0) = {4 5}',
+            '(3, 0, 0) = {6 7}',
+            '(4, 0, 0) = {8 9}',
+            '(5, 0, 0) = {10 11}',
+            '(6, 0, 0) = {12 13}',
+            '(7, 0, 0) = {14 15}',
+            '(8, 0, 0) = {16 17}',
+            '(9, 0, 0) = {18 19}',
+            '(10, 0, 0) = {20 21}',
+            '(11, 0, 0) = {22 23}',
+        ]
+        self.try_command('language renderscript allocation dump 12',
+                         expected_cells)
+
+    def test_dump_int3(self):
+        '''Dump allocation 13 and hand the expected 3-component cells to
+        try_command.
+
+        The expected coordinates cover x in 0..2 and y in 0..1.
+        '''
+        expected_cells = [
+            '(0, 0, 0) = {0 1 2}',
+            '(1, 0, 0) = {4 5 6}',
+            '(2, 0, 0) = {8 9 10}',
+            '(0, 1, 0) = {12 13 14}',
+            '(1, 1, 0) = {16 17 18}',
+            '(2, 1, 0) = {20 21 22}',
+        ]
+        self.try_command('language renderscript allocation dump 13',
+                         expected_cells)
+
+    def test_dump_int4(self):
+        '''Dump allocation 14 and hand the expected 4-component cells to
+        try_command.'''
+        expected_cells = [
+            '(0, 0, 0) = {0 1 2 3}',
+            '(1, 0, 0) = {4 5 6 7}',
+            '(2, 0, 0) = {8 9 10 11}',
+            '(3, 0, 0) = {12 13 14 15}',
+            '(4, 0, 0) = {16 17 18 19}',
+            '(5, 0, 0) = {20 21 22 23}',
+        ]
+        self.try_command('language renderscript allocation dump 14',
+                         expected_cells)
+
+    def test_dump_int5(self):
+        '''Dump allocation 15 and hand the 24 expected scalar cells
+        (values 0 through 23 along x) to try_command.
+
+        NOTE(review): the tests that follow are long2/long3/long4, so this
+        one may have been meant as test_dump_long -- confirm before renaming.
+        '''
+        expected_cells = [
+            '(0, 0, 0) = 0',
+            '(1, 0, 0) = 1',
+            '(2, 0, 0) = 2',
+            '(3, 0, 0) = 3',
+            '(4, 0, 0) = 4',
+            '(5, 0, 0) = 5',
+            '(6, 0, 0) = 6',
+            '(7, 0, 0) = 7',
+            '(8, 0, 0) = 8',
+            '(9, 0, 0) = 9',
+            '(10, 0, 0) = 10',
+            '(11, 0, 0) = 11',
+            '(12, 0, 0) = 12',
+            '(13, 0, 0) = 13',
+            '(14, 0, 0) = 14',
+            '(15, 0, 0) = 15',
+            '(16, 0, 0) = 16',
+            '(17, 0, 0) = 17',
+            '(18, 0, 0) = 18',
+            '(19, 0, 0) = 19',
+            '(20, 0, 0) = 20',
+            '(21, 0, 0) = 21',
+            '(22, 0, 0) = 22',
+            '(23, 0, 0) = 23',
+        ]
+        self.try_command('language renderscript allocation dump 15',
+                         expected_cells)
+
+    def test_dump_long2(self):
+        '''Dump allocation 16 and hand the 12 expected 2-component cells to
+        try_command.'''
+        expected_cells = [
+            '(0, 0, 0) = {0 1}',
+            '(1, 0, 0) = {2 3}',
+            '(2, 0, 0) = {4 5}',
+            '(3, 0, 0) = {6 7}',
+            '(4, 0, 0) = {8 9}',
+            '(5, 0, 0) = {10 11}',
+            '(6, 0, 0) = {12 13}',
+            '(7, 0, 0) = {14 15}',
+            '(8, 0, 0) = {16 17}',
+            '(9, 0, 0) = {18 19}',
+            '(10, 0, 0) = {20 21}',
+            '(11, 0, 0) = {22 23}',
+        ]
+        self.try_command('language renderscript allocation dump 16',
+                         expected_cells)
+
+    def test_dump_long3(self):
+        '''Dump allocation 17 and hand the expected 3-component cells to
+        try_command.
+
+        The expected values skip 3, 7, 11, ...; presumably every cell is
+        padded to four elements -- TODO confirm against the test app.
+        '''
+        expected_cells = [
+            '(0, 0, 0) = {0 1 2}',
+            '(1, 0, 0) = {4 5 6}',
+            '(2, 0, 0) = {8 9 10}',
+            '(3, 0, 0) = {12 13 14}',
+            '(4, 0, 0) = {16 17 18}',
+            '(5, 0, 0) = {20 21 22}',
+        ]
+        self.try_command('language renderscript allocation dump 17',
+                         expected_cells)
+
+    def test_dump_long4(self):
+        '''Dump allocation 18 and hand the expected 4-component cells to
+        try_command.
+
+        Unlike the neighbouring tests, the expected coordinates advance
+        along y (x stays 0).
+        '''
+        expected_cells = [
+            '(0, 0, 0) = {0 1 2 3}',
+            '(0, 1, 0) = {4 5 6 7}',
+            '(0, 2, 0) = {8 9 10 11}',
+            '(0, 3, 0) = {12 13 14 15}',
+            '(0, 4, 0) = {16 17 18 19}',
+            '(0, 5, 0) = {20 21 22 23}',
+        ]
+        self.try_command('language renderscript allocation dump 18',
+                         expected_cells)
+
+    def test_dump_bool(self):
+        '''Dump allocation 19 and hand the 24 expected boolean cells to
+        try_command.
+
+        The expected values alternate: false at even x, true at odd x.
+        '''
+        expected_cells = [
+            '(0, 0, 0) = false',
+            '(1, 0, 0) = true',
+            '(2, 0, 0) = false',
+            '(3, 0, 0) = true',
+            '(4, 0, 0) = false',
+            '(5, 0, 0) = true',
+            '(6, 0, 0) = false',
+            '(7, 0, 0) = true',
+            '(8, 0, 0) = false',
+            '(9, 0, 0) = true',
+            '(10, 0, 0) = false',
+            '(11, 0, 0) = true',
+            '(12, 0, 0) = false',
+            '(13, 0, 0) = true',
+            '(14, 0, 0) = false',
+            '(15, 0, 0) = true',
+            '(16, 0, 0) = false',
+            '(17, 0, 0) = true',
+            '(18, 0, 0) = false',
+            '(19, 0, 0) = true',
+            '(20, 0, 0) = false',
+            '(21, 0, 0) = true',
+            '(22, 0, 0) = false',
+            '(23, 0, 0) = true',
+        ]
+        self.try_command('language renderscript allocation dump 19',
+                         expected_cells)
+
+    @ordered_test('last')
+    @cpp_only_test()
+    def test_cpp_cleanup_breakpoints(self):
+        '''Delete breakpoints 1, 2 and 3 in order, then continue the process
+        and expect the exit-status-0 message.'''
+        delete_commands = (
+            'breakpoint delete 1',
+            'breakpoint delete 2',
+            'breakpoint delete 3',
+        )
+        for delete_command in delete_commands:
+            self.try_command(delete_command, ['1 breakpoints deleted'])
+
+        self.try_command('process continue', ['exited with status = 0'])

diff --git a/tests/lldb/tests/testcases/test_allocation_dump_2.py b/tests/lldb/tests/testcases/test_allocation_dump_2.py
new file mode 100644
index 0000000..13123ec
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_allocation_dump_2.py

@@ -0,0 +1,604 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestAllocationDump2'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    wimpy,
+    ordered_test
+)
+
+
+class TestAllocationDump2(TestBaseRemote):
+    '''Tests printing the contents of allocations.'''
+
+    # Maps the bundle kind 'java' to the target name 'Allocations'.
+    # NOTE(review): presumably read by the harness to choose the app under
+    # test -- confirm against TestBaseRemote.
+    bundle_target = {
+        'java': 'Allocations'
+    }
+
+    @wimpy
+    @ordered_test(0)
+    def test_allocation_dump1(self):
+        '''Enable breakpoints on all kernels, run to the second breakpoint
+        stop and dump allocation 20 (uchar).
+
+        Breakpoint 1 is deleted after the first stop, before continuing.
+        '''
+        # pylint: disable=line-too-long
+        stopped_at_breakpoint = [
+            'resuming',
+            'stopped',
+            'stop reason = breakpoint',
+        ]
+
+        self.try_command(
+            'language renderscript kernel breakpoint all enable',
+            ['Breakpoints will be set on all kernels'])
+
+        self.try_command('process continue', list(stopped_at_breakpoint))
+
+        self.try_command('breakpoint del 1', ['1 breakpoints deleted'])
+
+        # Hit second kernel
+        self.try_command('process continue', list(stopped_at_breakpoint))
+
+        # uchar
+        expected_cells = [
+            '(0, 0, 0) = 0',
+            '(1, 0, 0) = 1',
+            '(2, 0, 0) = 2',
+            '(3, 0, 0) = 3',
+            '(4, 0, 0) = 4',
+            '(5, 0, 0) = 5',
+            '(6, 0, 0) = 6',
+            '(7, 0, 0) = 7',
+            '(8, 0, 0) = 8',
+            '(9, 0, 0) = 9',
+            '(10, 0, 0) = 10',
+            '(11, 0, 0) = 11',
+            '(12, 0, 0) = 12',
+            '(13, 0, 0) = 13',
+            '(14, 0, 0) = 14',
+            '(15, 0, 0) = 15',
+            '(16, 0, 0) = 16',
+            '(17, 0, 0) = 17',
+            '(18, 0, 0) = 18',
+            '(19, 0, 0) = 19',
+            '(20, 0, 0) = 20',
+            '(21, 0, 0) = 21',
+            '(22, 0, 0) = 22',
+            '(23, 0, 0) = 23',
+        ]
+        self.try_command('language renderscript allocation dump 20',
+                         expected_cells)
+
+    @ordered_test(1)
+    def test_allocation_dump_unsigned_types(self):
+        '''Dump allocations 21 through 35 (uchar2 up to ulong4) and hand the
+        expected cells for each to try_command.
+
+        In the expected output, scalar cells are written in decimal while
+        vector cells are written as zero-padded hex.
+        '''
+        # uchar2
+        self.try_command(
+            'language renderscript allocation dump 21',
+            [
+                '(0, 0, 0) = {0x00 0x01}',
+                '(1, 0, 0) = {0x02 0x03}',
+                '(0, 1, 0) = {0x04 0x05}',
+                '(1, 1, 0) = {0x06 0x07}',
+                '(0, 2, 0) = {0x08 0x09}',
+                '(1, 2, 0) = {0x0a 0x0b}',
+                '(0, 3, 0) = {0x0c 0x0d}',
+                '(1, 3, 0) = {0x0e 0x0f}',
+                '(0, 4, 0) = {0x10 0x11}',
+                '(1, 4, 0) = {0x12 0x13}',
+                '(0, 5, 0) = {0x14 0x15}',
+                '(1, 5, 0) = {0x16 0x17}',
+            ])
+
+        # uchar3
+        self.try_command(
+            'language renderscript allocation dump 22',
+            [
+                '(0, 0, 0) = {0x00 0x01 0x02}',
+                '(1, 0, 0) = {0x04 0x05 0x06}',
+                '(2, 0, 0) = {0x08 0x09 0x0a}',
+                '(3, 0, 0) = {0x0c 0x0d 0x0e}',
+                '(4, 0, 0) = {0x10 0x11 0x12}',
+                '(5, 0, 0) = {0x14 0x15 0x16}',
+            ])
+
+        # uchar4
+        self.try_command(
+            'language renderscript allocation dump 23',
+            [
+                '(0, 0, 0) = {0x00 0x01 0x02 0x03}',
+                '(1, 0, 0) = {0x04 0x05 0x06 0x07}',
+                '(2, 0, 0) = {0x08 0x09 0x0a 0x0b}',
+                '(3, 0, 0) = {0x0c 0x0d 0x0e 0x0f}',
+                '(4, 0, 0) = {0x10 0x11 0x12 0x13}',
+                '(5, 0, 0) = {0x14 0x15 0x16 0x17}',
+            ])
+
+        # ushort
+        self.try_command(
+            'language renderscript allocation dump 24',
+            [
+                '(0, 0, 0) = 0',
+                '(1, 0, 0) = 1',
+                '(2, 0, 0) = 2',
+                '(3, 0, 0) = 3',
+                '(4, 0, 0) = 4',
+                '(5, 0, 0) = 5',
+                '(6, 0, 0) = 6',
+                '(7, 0, 0) = 7',
+                '(8, 0, 0) = 8',
+                '(9, 0, 0) = 9',
+                '(10, 0, 0) = 10',
+                '(11, 0, 0) = 11',
+                '(12, 0, 0) = 12',
+                '(13, 0, 0) = 13',
+                '(14, 0, 0) = 14',
+                '(15, 0, 0) = 15',
+                '(16, 0, 0) = 16',
+                '(17, 0, 0) = 17',
+                '(18, 0, 0) = 18',
+                '(19, 0, 0) = 19',
+                '(20, 0, 0) = 20',
+                '(21, 0, 0) = 21',
+                '(22, 0, 0) = 22',
+                '(23, 0, 0) = 23',
+            ])
+
+        # ushort2
+        self.try_command(
+            'language renderscript allocation dump 25',
+            [
+                '(0, 0, 0) = {0x0000 0x0001}',
+                '(1, 0, 0) = {0x0002 0x0003}',
+                '(2, 0, 0) = {0x0004 0x0005}',
+                '(3, 0, 0) = {0x0006 0x0007}',
+                '(4, 0, 0) = {0x0008 0x0009}',
+                '(5, 0, 0) = {0x000a 0x000b}',
+                '(6, 0, 0) = {0x000c 0x000d}',
+                '(7, 0, 0) = {0x000e 0x000f}',
+                '(8, 0, 0) = {0x0010 0x0011}',
+                '(9, 0, 0) = {0x0012 0x0013}',
+                '(10, 0, 0) = {0x0014 0x0015}',
+                '(11, 0, 0) = {0x0016 0x0017}',
+            ])
+
+        # ushort3
+        self.try_command(
+            'language renderscript allocation dump 26',
+            [
+                '(0, 0, 0) = {0x0000 0x0001 0x0002}',
+                '(0, 1, 0) = {0x0004 0x0005 0x0006}',
+                '(0, 2, 0) = {0x0008 0x0009 0x000a}',
+                '(0, 3, 0) = {0x000c 0x000d 0x000e}',
+                '(0, 4, 0) = {0x0010 0x0011 0x0012}',
+                '(0, 5, 0) = {0x0014 0x0015 0x0016}',
+            ])
+
+        # ushort4
+        self.try_command(
+            'language renderscript allocation dump 27',
+            [
+                '(0, 0, 0) = {0x0000 0x0001 0x0002 0x0003}',
+                '(1, 0, 0) = {0x0004 0x0005 0x0006 0x0007}',
+                '(2, 0, 0) = {0x0008 0x0009 0x000a 0x000b}',
+                '(3, 0, 0) = {0x000c 0x000d 0x000e 0x000f}',
+                '(4, 0, 0) = {0x0010 0x0011 0x0012 0x0013}',
+                '(5, 0, 0) = {0x0014 0x0015 0x0016 0x0017}',
+            ])
+
+        # uint
+        self.try_command(
+            'language renderscript allocation dump 28',
+            [
+                '(0, 0, 0) = 0',
+                '(1, 0, 0) = 1',
+                '(2, 0, 0) = 2',
+                '(3, 0, 0) = 3',
+                '(4, 0, 0) = 4',
+                '(5, 0, 0) = 5',
+                '(6, 0, 0) = 6',
+                '(7, 0, 0) = 7',
+                '(8, 0, 0) = 8',
+                '(9, 0, 0) = 9',
+                '(10, 0, 0) = 10',
+                '(11, 0, 0) = 11',
+                '(12, 0, 0) = 12',
+                '(13, 0, 0) = 13',
+                '(14, 0, 0) = 14',
+                '(15, 0, 0) = 15',
+                '(16, 0, 0) = 16',
+                '(17, 0, 0) = 17',
+                '(18, 0, 0) = 18',
+                '(19, 0, 0) = 19',
+                '(20, 0, 0) = 20',
+                '(21, 0, 0) = 21',
+                '(22, 0, 0) = 22',
+                '(23, 0, 0) = 23',
+            ])
+
+        # uint2
+        self.try_command(
+            'language renderscript allocation dump 29',
+            [
+                '(0, 0, 0) = {0x00000000 0x00000001}',
+                '(1, 0, 0) = {0x00000002 0x00000003}',
+                '(2, 0, 0) = {0x00000004 0x00000005}',
+                '(3, 0, 0) = {0x00000006 0x00000007}',
+                '(4, 0, 0) = {0x00000008 0x00000009}',
+                '(5, 0, 0) = {0x0000000a 0x0000000b}',
+                '(6, 0, 0) = {0x0000000c 0x0000000d}',
+                '(7, 0, 0) = {0x0000000e 0x0000000f}',
+                '(8, 0, 0) = {0x00000010 0x00000011}',
+                '(9, 0, 0) = {0x00000012 0x00000013}',
+                '(10, 0, 0) = {0x00000014 0x00000015}',
+                '(11, 0, 0) = {0x00000016 0x00000017}',
+            ])
+
+        # uint3
+        self.try_command(
+            'language renderscript allocation dump 30',
+            [
+                '(0, 0, 0) = {0x00000000 0x00000001 0x00000002}',
+                '(1, 0, 0) = {0x00000004 0x00000005 0x00000006}',
+                '(2, 0, 0) = {0x00000008 0x00000009 0x0000000a}',
+                '(3, 0, 0) = {0x0000000c 0x0000000d 0x0000000e}',
+                '(4, 0, 0) = {0x00000010 0x00000011 0x00000012}',
+                '(5, 0, 0) = {0x00000014 0x00000015 0x00000016}',
+            ])
+
+        # uint4
+        self.try_command(
+            'language renderscript allocation dump 31',
+            [
+                '(0, 0, 0) = {0x00000000 0x00000001 0x00000002 0x00000003}',
+                '(0, 0, 1) = {0x00000004 0x00000005 0x00000006 0x00000007}',
+                '(0, 0, 2) = {0x00000008 0x00000009 0x0000000a 0x0000000b}',
+                '(0, 0, 3) = {0x0000000c 0x0000000d 0x0000000e 0x0000000f}',
+                '(0, 0, 4) = {0x00000010 0x00000011 0x00000012 0x00000013}',
+                '(0, 0, 5) = {0x00000014 0x00000015 0x00000016 0x00000017}',
+            ])
+
+        # ulong
+        self.try_command(
+            'language renderscript allocation dump 32',
+            [
+                '(0, 0, 0) = 0',
+                '(1, 0, 0) = 1',
+                '(2, 0, 0) = 2',
+                '(3, 0, 0) = 3',
+                '(0, 1, 0) = 4',
+                '(1, 1, 0) = 5',
+                '(2, 1, 0) = 6',
+                '(3, 1, 0) = 7',
+                '(0, 2, 0) = 8',
+                '(1, 2, 0) = 9',
+                '(2, 2, 0) = 10',
+                '(3, 2, 0) = 11',
+                '(0, 0, 1) = 12',
+                '(1, 0, 1) = 13',
+                '(2, 0, 1) = 14',
+                '(3, 0, 1) = 15',
+                '(0, 1, 1) = 16',
+                '(1, 1, 1) = 17',
+                '(2, 1, 1) = 18',
+                '(3, 1, 1) = 19',
+                '(0, 2, 1) = 20',
+                '(1, 2, 1) = 21',
+                '(2, 2, 1) = 22',
+                '(3, 2, 1) = 23',
+            ])
+
+        # ulong2
+        self.try_command(
+            'language renderscript allocation dump 33',
+            [
+                '(0, 0, 0) = {0x0000000000000000 0x0000000000000001}',
+                '(1, 0, 0) = {0x0000000000000002 0x0000000000000003}',
+                '(2, 0, 0) = {0x0000000000000004 0x0000000000000005}',
+                '(3, 0, 0) = {0x0000000000000006 0x0000000000000007}',
+                '(4, 0, 0) = {0x0000000000000008 0x0000000000000009}',
+                '(5, 0, 0) = {0x000000000000000a 0x000000000000000b}',
+                '(6, 0, 0) = {0x000000000000000c 0x000000000000000d}',
+                '(7, 0, 0) = {0x000000000000000e 0x000000000000000f}',
+                '(8, 0, 0) = {0x0000000000000010 0x0000000000000011}',
+                '(9, 0, 0) = {0x0000000000000012 0x0000000000000013}',
+                '(10, 0, 0) = {0x0000000000000014 0x0000000000000015}',
+                '(11, 0, 0) = {0x0000000000000016 0x0000000000000017}',
+            ])
+
+        # ulong3
+        self.try_command(
+            'language renderscript allocation dump 34',
+            [
+                '(0, 0, 0) = {0x0000000000000000 0x0000000000000001 0x0000000000000002}',
+                '(1, 0, 0) = {0x0000000000000004 0x0000000000000005 0x0000000000000006}',
+                '(2, 0, 0) = {0x0000000000000008 0x0000000000000009 0x000000000000000a}',
+                '(3, 0, 0) = {0x000000000000000c 0x000000000000000d 0x000000000000000e}',
+                '(4, 0, 0) = {0x0000000000000010 0x0000000000000011 0x0000000000000012}',
+                '(5, 0, 0) = {0x0000000000000014 0x0000000000000015 0x0000000000000016}',
+            ])
+
+        # ulong4 -- each expected cell is one string split over two literals
+        self.try_command(
+            'language renderscript allocation dump 35',
+            [
+                '(0, 0, 0) = {0x0000000000000000 0x0000000000000001 '
+                '0x0000000000000002 0x0000000000000003}',
+                '(1, 0, 0) = {0x0000000000000004 0x0000000000000005 '
+                '0x0000000000000006 0x0000000000000007}',
+                '(2, 0, 0) = {0x0000000000000008 0x0000000000000009 '
+                '0x000000000000000a 0x000000000000000b}',
+                '(3, 0, 0) = {0x000000000000000c 0x000000000000000d '
+                '0x000000000000000e 0x000000000000000f}',
+                '(4, 0, 0) = {0x0000000000000010 0x0000000000000011 '
+                '0x0000000000000012 0x0000000000000013}',
+                '(5, 0, 0) = {0x0000000000000014 0x0000000000000015 '
+                '0x0000000000000016 0x0000000000000017}',
+            ])
+
+    @wimpy
+    @ordered_test(3)
+    def test_dump_square_kernel(self):
+        '''Delete breakpoint 2, continue to the next breakpoint stop and
+        dump allocation 28, expecting each cell to hold the square of its
+        x index.'''
+        self.try_command('breakpoint del 2', ['1 breakpoints deleted'])
+
+        # Hit third kernel
+        stop_messages = [
+            'resuming',
+            'stopped',
+            'stop reason = breakpoint',
+        ]
+        self.try_command('process continue', stop_messages)
+
+        # Test that uint allocation has been squared by square_kernel
+        squared_cells = [
+            '(0, 0, 0) = 0',
+            '(1, 0, 0) = 1',
+            '(2, 0, 0) = 4',
+            '(3, 0, 0) = 9',
+            '(4, 0, 0) = 16',
+            '(5, 0, 0) = 25',
+            '(6, 0, 0) = 36',
+            '(7, 0, 0) = 49',
+            '(8, 0, 0) = 64',
+            '(9, 0, 0) = 81',
+            '(10, 0, 0) = 100',
+            '(11, 0, 0) = 121',
+            '(12, 0, 0) = 144',
+            '(13, 0, 0) = 169',
+            '(14, 0, 0) = 196',
+            '(15, 0, 0) = 225',
+            '(16, 0, 0) = 256',
+            '(17, 0, 0) = 289',
+            '(18, 0, 0) = 324',
+            '(19, 0, 0) = 361',
+            '(20, 0, 0) = 400',
+            '(21, 0, 0) = 441',
+            '(22, 0, 0) = 484',
+            '(23, 0, 0) = 529',
+        ]
+        self.try_command('language renderscript allocation dump 28',
+                         squared_cells)
+
+    @ordered_test(4)
+    def test_alloction_dump_floating_types(self):
+        '''Dump allocations 36 through 47 (half up to double4) and hand the
+        expected cells for each to try_command.
+
+        For half2/half3/half4 one cell is passed separately as a regex that
+        accepts either 1.01562 or 1.01563 -- presumably to tolerate rounding
+        differences in the printed value; confirm against the harness.
+        '''
+        # half
+        self.try_command(
+            'language renderscript allocation dump 36',
+            [
+                '(0, 0, 0) = 1',
+                '(1, 0, 0) = 1.00098',
+                '(2, 0, 0) = 1.00195',
+                '(3, 0, 0) = 1.00293',
+                '(4, 0, 0) = 1.00391',
+                '(5, 0, 0) = 1.00488',
+                '(6, 0, 0) = 1.00586',
+                '(7, 0, 0) = 1.00684',
+                '(8, 0, 0) = 1.00781',
+                '(9, 0, 0) = 1.00879',
+                '(10, 0, 0) = 1.00977',
+                '(11, 0, 0) = 1.01074',
+                '(12, 0, 0) = 1.01172',
+                '(13, 0, 0) = 1.0127',
+                '(14, 0, 0) = 1.01367',
+                '(15, 0, 0) = 1.01465',
+                '(16, 0, 0) = 1.0156',
+                '(17, 0, 0) = 1.0166',
+                '(18, 0, 0) = 1.01758',
+                '(19, 0, 0) = 1.01855',
+                '(20, 0, 0) = 1.01953',
+                '(21, 0, 0) = 1.02051',
+                '(22, 0, 0) = 1.02148',
+                '(23, 0, 0) = 1.02246',
+            ])
+
+        # half2
+        self.try_command(
+            'language renderscript allocation dump 37',
+            [
+                '(0, 0, 0) = {1 1.00098}',
+                '(1, 0, 0) = {1.00195 1.00293}',
+                '(2, 0, 0) = {1.00391 1.00488}',
+                '(3, 0, 0) = {1.00586 1.00684}',
+                '(4, 0, 0) = {1.00781 1.00879}',
+                '(5, 0, 0) = {1.00977 1.01074}',
+                '(6, 0, 0) = {1.01172 1.0127}',
+                '(7, 0, 0) = {1.01367 1.01465}',
+                '(9, 0, 0) = {1.01758 1.01855}',
+                '(10, 0, 0) = {1.01953 1.02051}',
+                '(11, 0, 0) = {1.02148 1.02246}',
+            ],
+            [r'\(8, 0, 0\) = \{1\.0156[23] 1\.0166\}'])
+
+        # half3
+        self.try_command(
+            'language renderscript allocation dump 38',
+            [
+                '(0, 0, 0) = {1 1.00098 1.00195}',
+                '(0, 1, 0) = {1.00391 1.00488 1.00586}',
+                '(0, 2, 0) = {1.00781 1.00879 1.00977}',
+                '(0, 3, 0) = {1.01172 1.0127 1.01367}',
+                '(0, 5, 0) = {1.01953 1.02051 1.02148}',
+            ],
+            [r'\(0, 4, 0\) = \{1\.0156[23] 1\.0166 1\.01758\}'])
+
+        # half4
+        self.try_command(
+            'language renderscript allocation dump 39',
+            [
+                '(0, 0, 0) = {1 1.00098 1.00195 1.00293}',
+                '(1, 0, 0) = {1.00391 1.00488 1.00586 1.00684}',
+                '(2, 0, 0) = {1.00781 1.00879 1.00977 1.01074}',
+                '(3, 0, 0) = {1.01172 1.0127 1.01367 1.01465}',
+                '(5, 0, 0) = {1.01953 1.02051 1.02148 1.02246}',
+            ],
+            [r'\(4, 0, 0\) = \{1\.0156[23] 1\.0166 1\.01758 1\.01855\}'])
+
+        # float
+        self.try_command(
+            'language renderscript allocation dump 40',
+            [
+                '(0, 0, 0) = inf',
+                '(1, 0, 0) = 1',
+                '(2, 0, 0) = 0.5',
+                '(3, 0, 0) = 0.333333',
+                '(4, 0, 0) = 0.25',
+                '(5, 0, 0) = 0.2',
+                '(6, 0, 0) = 0.166667',
+                '(7, 0, 0) = 0.142857',
+                '(8, 0, 0) = 0.125',
+                '(9, 0, 0) = 0.111111',
+                '(10, 0, 0) = 0.1',
+                '(11, 0, 0) = 0.0909091',
+                '(12, 0, 0) = 0.0833333',
+                '(13, 0, 0) = 0.0769231',
+                '(14, 0, 0) = 0.0714286',
+                '(15, 0, 0) = 0.0666667',
+                '(16, 0, 0) = 0.0625',
+                '(17, 0, 0) = 0.0588235',
+                '(18, 0, 0) = 0.0555556',
+                '(19, 0, 0) = 0.0526316',
+                '(20, 0, 0) = 0.05',
+                '(21, 0, 0) = 0.047619',
+                '(22, 0, 0) = 0.0454545',
+                '(23, 0, 0) = 0.0434783',
+            ])
+
+        # float2
+        self.try_command(
+            'language renderscript allocation dump 41',
+            [
+                '(0, 0, 0) = {inf 1}',
+                '(1, 0, 0) = {0.5 0.333333}',
+                '(2, 0, 0) = {0.25 0.2}',
+                '(3, 0, 0) = {0.166667 0.142857}',
+                '(4, 0, 0) = {0.125 0.111111}',
+                '(5, 0, 0) = {0.1 0.0909091}',
+                '(6, 0, 0) = {0.0833333 0.0769231}',
+                '(7, 0, 0) = {0.0714286 0.0666667}',
+                '(8, 0, 0) = {0.0625 0.0588235}',
+                '(9, 0, 0) = {0.0555556 0.0526316}',
+                '(10, 0, 0) = {0.05 0.047619}',
+                '(11, 0, 0) = {0.0454545 0.0434783}',
+            ])
+
+        # float3
+        self.try_command(
+            'language renderscript allocation dump 42',
+            [
+                '(0, 0, 0) = {inf 1 0.5}',
+                '(1, 0, 0) = {0.25 0.2 0.166667}',
+                '(2, 0, 0) = {0.125 0.111111 0.1}',
+                '(3, 0, 0) = {0.0833333 0.0769231 0.0714286}',
+                '(4, 0, 0) = {0.0625 0.0588235 0.0555556}',
+                '(5, 0, 0) = {0.05 0.047619 0.0454545}',
+            ])
+
+        # float4
+        self.try_command(
+            'language renderscript allocation dump 43',
+            [
+                '(0, 0, 0) = {inf 1 0.5 0.333333}',
+                '(1, 0, 0) = {0.25 0.2 0.166667 0.142857}',
+                '(2, 0, 0) = {0.125 0.111111 0.1 0.0909091}',
+                '(0, 1, 0) = {0.0833333 0.0769231 0.0714286 0.0666667}',
+                '(1, 1, 0) = {0.0625 0.0588235 0.0555556 0.0526316}',
+                '(2, 1, 0) = {0.05 0.047619 0.0454545 0.0434783}',
+            ])
+
+        # double
+        self.try_command(
+            'language renderscript allocation dump 44',
+            [
+                '(0, 0, 0) = inf',
+                '(1, 0, 0) = 1',
+                '(2, 0, 0) = 0.5',
+                '(3, 0, 0) = 0.333333333333333',
+                '(4, 0, 0) = 0.25',
+                '(5, 0, 0) = 0.2',
+                '(6, 0, 0) = 0.166666666666667',
+                '(7, 0, 0) = 0.142857142857143',
+                '(8, 0, 0) = 0.125',
+                '(9, 0, 0) = 0.111111111111111',
+                '(10, 0, 0) = 0.1',
+                '(11, 0, 0) = 0.0909090909090909',
+                '(12, 0, 0) = 0.0833333333333333',
+                '(13, 0, 0) = 0.0769230769230769',
+                '(14, 0, 0) = 0.0714285714285714',
+                '(15, 0, 0) = 0.0666666666666667',
+                '(16, 0, 0) = 0.0625',
+                '(17, 0, 0) = 0.0588235294117647',
+                '(18, 0, 0) = 0.0555555555555556',
+                '(19, 0, 0) = 0.0526315789473684',
+                '(20, 0, 0) = 0.05',
+                '(21, 0, 0) = 0.0476190476190476',
+                '(22, 0, 0) = 0.0454545454545455',
+                '(23, 0, 0) = 0.0434782608695652',
+            ])
+
+        # double2
+        self.try_command(
+            'language renderscript allocation dump 45',
+            [
+                '(0, 0, 0) = {inf 1}',
+                '(1, 0, 0) = {0.5 0.333333333333333}',
+                '(2, 0, 0) = {0.25 0.2}',
+                '(3, 0, 0) = {0.166666666666667 0.142857142857143}',
+                '(0, 0, 1) = {0.125 0.111111111111111}',
+                '(1, 0, 1) = {0.1 0.0909090909090909}',
+                '(2, 0, 1) = {0.0833333333333333 0.0769230769230769}',
+                '(3, 0, 1) = {0.0714285714285714 0.0666666666666667}',
+                '(0, 0, 2) = {0.0625 0.0588235294117647}',
+                '(1, 0, 2) = {0.0555555555555556 0.0526315789473684}',
+                '(2, 0, 2) = {0.05 0.0476190476190476}',
+                '(3, 0, 2) = {0.0454545454545455 0.0434782608695652}',
+            ])
+
+        # double3 -- the (0, 1, 1) cell is one string split over two literals
+        self.try_command(
+            'language renderscript allocation dump 46',
+            [
+                '(0, 0, 0) = {inf 1 0.5}',
+                '(0, 1, 0) = {0.25 0.2 0.166666666666667}',
+                '(0, 0, 1) = {0.125 0.111111111111111 0.1}',
+                '(0, 1, 1) = {0.0833333333333333 0.0769230769230769 '
+                '0.0714285714285714}',
+                '(0, 0, 2) = {0.0625 0.0588235294117647 0.0555555555555556}',
+                '(0, 1, 2) = {0.05 0.0476190476190476 0.0454545454545455}',
+            ])
+
+        # double4 -- the last three cells are each split over two literals
+        self.try_command(
+            'language renderscript allocation dump 47',
+            [
+                '(0, 0, 0) = {inf 1 0.5 0.333333333333333}',
+                '(0, 1, 0) = {0.25 0.2 0.166666666666667 0.142857142857143}',
+                '(0, 0, 1) = {0.125 0.111111111111111 0.1 0.0909090909090909}',
+                '(0, 1, 1) = {0.0833333333333333 0.0769230769230769 '
+                '0.0714285714285714 0.0666666666666667}',
+                '(0, 0, 2) = {0.0625 0.0588235294117647 '
+                '0.0555555555555556 0.0526315789473684}',
+                '(0, 1, 2) = {0.05 0.0476190476190476 '
+                '0.0454545454545455 0.0434782608695652}',
+            ])
+
+    @wimpy
+    @ordered_test(5)
+    def test_allocation_dump_half_kernel(self):
+        # Delete breakpoint 3, the kernel breakpoint on add_half_kernel
+        self.try_command('breakpoint del 3',
+                         ['1 breakpoints deleted'])
+
+        # Resume; the next breakpoint stop is expected to be in struct_kernel
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+        # double3 (46) was modified by add_half_kernel: values are now 0.5 higher
+        self.try_command('language renderscript allocation dump 46',
+                         ['(0, 0, 0) = {inf 1.5 1}',
+                          '(0, 1, 0) = {0.75 0.7 0.666666666666667}',
+                          '(0, 0, 1) = {0.625 0.611111111111111 0.6}',
+                          '(0, 1, 1) = {0.583333333333333 0.576923076923077 0.571428571428571}',
+                          '(0, 0, 2) = {0.5625 0.558823529411765 0.555555555555556}',
+                          '(0, 1, 2) = {0.55 0.547619047619048 0.545454545454545}'])
+
+        # Floating point allocations 40-43 should have been overwritten with -1/i
+        self.try_command('language renderscript allocation dump 40',
+                         ['(0, 0, 0) = -inf',
+                          '(1, 0, 0) = -1',
+                          '(2, 0, 0) = -0.5',
+                          '(3, 0, 0) = -0.333333',
+                          '(4, 0, 0) = -0.25',
+                          '(5, 0, 0) = -0.2',
+                          '(6, 0, 0) = -0.166667',
+                          '(7, 0, 0) = -0.142857',
+                          '(8, 0, 0) = -0.125',
+                          '(9, 0, 0) = -0.111111',
+                          '(10, 0, 0) = -0.1',
+                          '(11, 0, 0) = -0.0909091',
+                          '(12, 0, 0) = -0.0833333',
+                          '(13, 0, 0) = -0.0769231',
+                          '(14, 0, 0) = -0.0714286',
+                          '(15, 0, 0) = -0.0666667',
+                          '(16, 0, 0) = -0.0625',
+                          '(17, 0, 0) = -0.0588235',
+                          '(18, 0, 0) = -0.0555556',
+                          '(19, 0, 0) = -0.0526316',
+                          '(20, 0, 0) = -0.05',
+                          '(21, 0, 0) = -0.047619',
+                          '(22, 0, 0) = -0.0454545',
+                          '(23, 0, 0) = -0.0434783'])
+
+        self.try_command('language renderscript allocation dump 41',
+                         ['(0, 0, 0) = {-inf -1}',
+                          '(1, 0, 0) = {-0.5 -0.333333}',
+                          '(2, 0, 0) = {-0.25 -0.2}',
+                          '(3, 0, 0) = {-0.166667 -0.142857}',
+                          '(4, 0, 0) = {-0.125 -0.111111}',
+                          '(5, 0, 0) = {-0.1 -0.0909091}',
+                          '(6, 0, 0) = {-0.0833333 -0.0769231}',
+                          '(7, 0, 0) = {-0.0714286 -0.0666667}',
+                          '(8, 0, 0) = {-0.0625 -0.0588235}',
+                          '(9, 0, 0) = {-0.0555556 -0.0526316}',
+                          '(10, 0, 0) = {-0.05 -0.047619}',
+                          '(11, 0, 0) = {-0.0454545 -0.0434783}'])
+
+        self.try_command('language renderscript allocation dump 42',
+                         ['(0, 0, 0) = {-inf -1 -0.5}',
+                          '(1, 0, 0) = {-0.25 -0.2 -0.166667}',
+                          '(2, 0, 0) = {-0.125 -0.111111 -0.1}',
+                          '(3, 0, 0) = {-0.0833333 -0.0769231 -0.0714286}',
+                          '(4, 0, 0) = {-0.0625 -0.0588235 -0.0555556}',
+                          '(5, 0, 0) = {-0.05 -0.047619 -0.0454545}'])
+
+        self.try_command('language renderscript allocation dump 43',
+                         ['(0, 0, 0) = {-inf -1 -0.5 -0.333333}',
+                          '(1, 0, 0) = {-0.25 -0.2 -0.166667 -0.142857}',
+                          '(2, 0, 0) = {-0.125 -0.111111 -0.1 -0.0909091}',
+                          '(0, 1, 0) = {-0.0833333 -0.0769231 -0.0714286 -0.0666667}',
+                          '(1, 1, 0) = {-0.0625 -0.0588235 -0.0555556 -0.0526316}',
+                          '(2, 1, 0) = {-0.05 -0.047619 -0.0454545 -0.0434783}'])

diff --git a/tests/lldb/tests/testcases/test_allocation_dump_2_cpp.py b/tests/lldb/tests/testcases/test_allocation_dump_2_cpp.py
new file mode 100644
index 0000000..b103ccb
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_allocation_dump_2_cpp.py

@@ -0,0 +1,525 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestAllocationDump2Cpp.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+
+
+class TestAllocationDump2Cpp(TestBaseRemote):
+    '''Tests 'allocation dump' output for the CppAllocations NDK app.'''
+
+    bundle_target = {
+        'cpp': 'CppAllocations'
+    }
+
+    def test_case(self):
+        '''Dump allocations 20-47 while stopped at successive kernel breakpoints.
+
+        Raises:
+            TestFail: One of the lldb commands did not provide the expected
+            output.
+        '''
+        # pylint: disable=line-too-long
+        self.try_command('language renderscript kernel breakpoint all enable',
+                         ['Breakpoints will be set on all kernels'])
+
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+        self.try_command('breakpoint del 1',
+                         ['1 breakpoints deleted'])
+
+        # Resume until the second kernel's breakpoint is hit
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+        # uchar: 24 scalars along x, printed in decimal
+        self.try_command('language renderscript allocation dump 20',
+                         ['(0, 0, 0) = 0',
+                          '(1, 0, 0) = 1',
+                          '(2, 0, 0) = 2',
+                          '(3, 0, 0) = 3',
+                          '(4, 0, 0) = 4',
+                          '(5, 0, 0) = 5',
+                          '(6, 0, 0) = 6',
+                          '(7, 0, 0) = 7',
+                          '(8, 0, 0) = 8',
+                          '(9, 0, 0) = 9',
+                          '(10, 0, 0) = 10',
+                          '(11, 0, 0) = 11',
+                          '(12, 0, 0) = 12',
+                          '(13, 0, 0) = 13',
+                          '(14, 0, 0) = 14',
+                          '(15, 0, 0) = 15',
+                          '(16, 0, 0) = 16',
+                          '(17, 0, 0) = 17',
+                          '(18, 0, 0) = 18',
+                          '(19, 0, 0) = 19',
+                          '(20, 0, 0) = 20',
+                          '(21, 0, 0) = 21',
+                          '(22, 0, 0) = 22',
+                          '(23, 0, 0) = 23'])
+
+        # uchar2: x in 0-1, y in 0-5; vector components are printed in hex
+        self.try_command('language renderscript allocation dump 21',
+                         ['(0, 0, 0) = {0x00 0x01}',
+                          '(1, 0, 0) = {0x02 0x03}',
+                          '(0, 1, 0) = {0x04 0x05}',
+                          '(1, 1, 0) = {0x06 0x07}',
+                          '(0, 2, 0) = {0x08 0x09}',
+                          '(1, 2, 0) = {0x0a 0x0b}',
+                          '(0, 3, 0) = {0x0c 0x0d}',
+                          '(1, 3, 0) = {0x0e 0x0f}',
+                          '(0, 4, 0) = {0x10 0x11}',
+                          '(1, 4, 0) = {0x12 0x13}',
+                          '(0, 5, 0) = {0x14 0x15}',
+                          '(1, 5, 0) = {0x16 0x17}'])
+
+        # uchar3: every 4th value (0x03, 0x07, ...) is skipped, presumably padding
+        self.try_command('language renderscript allocation dump 22',
+                         ['(0, 0, 0) = {0x00 0x01 0x02}',
+                          '(1, 0, 0) = {0x04 0x05 0x06}',
+                          '(2, 0, 0) = {0x08 0x09 0x0a}',
+                          '(3, 0, 0) = {0x0c 0x0d 0x0e}',
+                          '(4, 0, 0) = {0x10 0x11 0x12}',
+                          '(5, 0, 0) = {0x14 0x15 0x16}'])
+
+        # uchar4: 6 cells along x, all four components shown
+        self.try_command('language renderscript allocation dump 23',
+                         ['(0, 0, 0) = {0x00 0x01 0x02 0x03}',
+                          '(1, 0, 0) = {0x04 0x05 0x06 0x07}',
+                          '(2, 0, 0) = {0x08 0x09 0x0a 0x0b}',
+                          '(3, 0, 0) = {0x0c 0x0d 0x0e 0x0f}',
+                          '(4, 0, 0) = {0x10 0x11 0x12 0x13}',
+                          '(5, 0, 0) = {0x14 0x15 0x16 0x17}'])
+
+        # ushort: 24 scalars along x, printed in decimal
+        self.try_command('language renderscript allocation dump 24',
+                         ['(0, 0, 0) = 0',
+                          '(1, 0, 0) = 1',
+                          '(2, 0, 0) = 2',
+                          '(3, 0, 0) = 3',
+                          '(4, 0, 0) = 4',
+                          '(5, 0, 0) = 5',
+                          '(6, 0, 0) = 6',
+                          '(7, 0, 0) = 7',
+                          '(8, 0, 0) = 8',
+                          '(9, 0, 0) = 9',
+                          '(10, 0, 0) = 10',
+                          '(11, 0, 0) = 11',
+                          '(12, 0, 0) = 12',
+                          '(13, 0, 0) = 13',
+                          '(14, 0, 0) = 14',
+                          '(15, 0, 0) = 15',
+                          '(16, 0, 0) = 16',
+                          '(17, 0, 0) = 17',
+                          '(18, 0, 0) = 18',
+                          '(19, 0, 0) = 19',
+                          '(20, 0, 0) = 20',
+                          '(21, 0, 0) = 21',
+                          '(22, 0, 0) = 22',
+                          '(23, 0, 0) = 23'])
+
+        # ushort2: 12 cells along x, components printed as 4-digit hex
+        self.try_command('language renderscript allocation dump 25',
+                         ['(0, 0, 0) = {0x0000 0x0001}',
+                          '(1, 0, 0) = {0x0002 0x0003}',
+                          '(2, 0, 0) = {0x0004 0x0005}',
+                          '(3, 0, 0) = {0x0006 0x0007}',
+                          '(4, 0, 0) = {0x0008 0x0009}',
+                          '(5, 0, 0) = {0x000a 0x000b}',
+                          '(6, 0, 0) = {0x000c 0x000d}',
+                          '(7, 0, 0) = {0x000e 0x000f}',
+                          '(8, 0, 0) = {0x0010 0x0011}',
+                          '(9, 0, 0) = {0x0012 0x0013}',
+                          '(10, 0, 0) = {0x0014 0x0015}',
+                          '(11, 0, 0) = {0x0016 0x0017}'])
+
+        # ushort3: cells run along y rather than x; every 4th value is skipped
+        self.try_command('language renderscript allocation dump 26',
+                         ['(0, 0, 0) = {0x0000 0x0001 0x0002}',
+                          '(0, 1, 0) = {0x0004 0x0005 0x0006}',
+                          '(0, 2, 0) = {0x0008 0x0009 0x000a}',
+                          '(0, 3, 0) = {0x000c 0x000d 0x000e}',
+                          '(0, 4, 0) = {0x0010 0x0011 0x0012}',
+                          '(0, 5, 0) = {0x0014 0x0015 0x0016}'])
+
+        # ushort4: 6 cells along x, all four components shown
+        self.try_command('language renderscript allocation dump 27',
+                         ['(0, 0, 0) = {0x0000 0x0001 0x0002 0x0003}',
+                          '(1, 0, 0) = {0x0004 0x0005 0x0006 0x0007}',
+                          '(2, 0, 0) = {0x0008 0x0009 0x000a 0x000b}',
+                          '(3, 0, 0) = {0x000c 0x000d 0x000e 0x000f}',
+                          '(4, 0, 0) = {0x0010 0x0011 0x0012 0x0013}',
+                          '(5, 0, 0) = {0x0014 0x0015 0x0016 0x0017}'])
+
+        # uint: 24 scalars along x (dumped again below, after square_kernel)
+        self.try_command('language renderscript allocation dump 28',
+                         ['(0, 0, 0) = 0',
+                          '(1, 0, 0) = 1',
+                          '(2, 0, 0) = 2',
+                          '(3, 0, 0) = 3',
+                          '(4, 0, 0) = 4',
+                          '(5, 0, 0) = 5',
+                          '(6, 0, 0) = 6',
+                          '(7, 0, 0) = 7',
+                          '(8, 0, 0) = 8',
+                          '(9, 0, 0) = 9',
+                          '(10, 0, 0) = 10',
+                          '(11, 0, 0) = 11',
+                          '(12, 0, 0) = 12',
+                          '(13, 0, 0) = 13',
+                          '(14, 0, 0) = 14',
+                          '(15, 0, 0) = 15',
+                          '(16, 0, 0) = 16',
+                          '(17, 0, 0) = 17',
+                          '(18, 0, 0) = 18',
+                          '(19, 0, 0) = 19',
+                          '(20, 0, 0) = 20',
+                          '(21, 0, 0) = 21',
+                          '(22, 0, 0) = 22',
+                          '(23, 0, 0) = 23'])
+
+        # uint2: 12 cells along x, components printed as 8-digit hex
+        self.try_command('language renderscript allocation dump 29',
+                         ['(0, 0, 0) = {0x00000000 0x00000001}',
+                          '(1, 0, 0) = {0x00000002 0x00000003}',
+                          '(2, 0, 0) = {0x00000004 0x00000005}',
+                          '(3, 0, 0) = {0x00000006 0x00000007}',
+                          '(4, 0, 0) = {0x00000008 0x00000009}',
+                          '(5, 0, 0) = {0x0000000a 0x0000000b}',
+                          '(6, 0, 0) = {0x0000000c 0x0000000d}',
+                          '(7, 0, 0) = {0x0000000e 0x0000000f}',
+                          '(8, 0, 0) = {0x00000010 0x00000011}',
+                          '(9, 0, 0) = {0x00000012 0x00000013}',
+                          '(10, 0, 0) = {0x00000014 0x00000015}',
+                          '(11, 0, 0) = {0x00000016 0x00000017}'])
+
+        # uint3: 6 cells along x; every 4th value is skipped
+        self.try_command('language renderscript allocation dump 30',
+                         ['(0, 0, 0) = {0x00000000 0x00000001 0x00000002}',
+                          '(1, 0, 0) = {0x00000004 0x00000005 0x00000006}',
+                          '(2, 0, 0) = {0x00000008 0x00000009 0x0000000a}',
+                          '(3, 0, 0) = {0x0000000c 0x0000000d 0x0000000e}',
+                          '(4, 0, 0) = {0x00000010 0x00000011 0x00000012}',
+                          '(5, 0, 0) = {0x00000014 0x00000015 0x00000016}'])
+
+        # uint4: the 6 cells are laid out along z
+        self.try_command('language renderscript allocation dump 31',
+                         ['(0, 0, 0) = {0x00000000 0x00000001 0x00000002 0x00000003}',
+                          '(0, 0, 1) = {0x00000004 0x00000005 0x00000006 0x00000007}',
+                          '(0, 0, 2) = {0x00000008 0x00000009 0x0000000a 0x0000000b}',
+                          '(0, 0, 3) = {0x0000000c 0x0000000d 0x0000000e 0x0000000f}',
+                          '(0, 0, 4) = {0x00000010 0x00000011 0x00000012 0x00000013}',
+                          '(0, 0, 5) = {0x00000014 0x00000015 0x00000016 0x00000017}'])
+
+        # ulong: x in 0-3, y in 0-2, z in 0-1; x varies fastest, then y, then z
+        self.try_command('language renderscript allocation dump 32',
+                         ['(0, 0, 0) = 0',
+                          '(1, 0, 0) = 1',
+                          '(2, 0, 0) = 2',
+                          '(3, 0, 0) = 3',
+                          '(0, 1, 0) = 4',
+                          '(1, 1, 0) = 5',
+                          '(2, 1, 0) = 6',
+                          '(3, 1, 0) = 7',
+                          '(0, 2, 0) = 8',
+                          '(1, 2, 0) = 9',
+                          '(2, 2, 0) = 10',
+                          '(3, 2, 0) = 11',
+                          '(0, 0, 1) = 12',
+                          '(1, 0, 1) = 13',
+                          '(2, 0, 1) = 14',
+                          '(3, 0, 1) = 15',
+                          '(0, 1, 1) = 16',
+                          '(1, 1, 1) = 17',
+                          '(2, 1, 1) = 18',
+                          '(3, 1, 1) = 19',
+                          '(0, 2, 1) = 20',
+                          '(1, 2, 1) = 21',
+                          '(2, 2, 1) = 22',
+                          '(3, 2, 1) = 23'])
+
+        # ulong2: 12 cells along x, components printed as 16-digit hex
+        self.try_command('language renderscript allocation dump 33',
+                         ['(0, 0, 0) = {0x0000000000000000 0x0000000000000001}',
+                          '(1, 0, 0) = {0x0000000000000002 0x0000000000000003}',
+                          '(2, 0, 0) = {0x0000000000000004 0x0000000000000005}',
+                          '(3, 0, 0) = {0x0000000000000006 0x0000000000000007}',
+                          '(4, 0, 0) = {0x0000000000000008 0x0000000000000009}',
+                          '(5, 0, 0) = {0x000000000000000a 0x000000000000000b}',
+                          '(6, 0, 0) = {0x000000000000000c 0x000000000000000d}',
+                          '(7, 0, 0) = {0x000000000000000e 0x000000000000000f}',
+                          '(8, 0, 0) = {0x0000000000000010 0x0000000000000011}',
+                          '(9, 0, 0) = {0x0000000000000012 0x0000000000000013}',
+                          '(10, 0, 0) = {0x0000000000000014 0x0000000000000015}',
+                          '(11, 0, 0) = {0x0000000000000016 0x0000000000000017}'])
+
+        # ulong3: 6 cells along x; every 4th value is skipped
+        self.try_command('language renderscript allocation dump 34',
+                         ['(0, 0, 0) = {0x0000000000000000 0x0000000000000001 0x0000000000000002}',
+                          '(1, 0, 0) = {0x0000000000000004 0x0000000000000005 0x0000000000000006}',
+                          '(2, 0, 0) = {0x0000000000000008 0x0000000000000009 0x000000000000000a}',
+                          '(3, 0, 0) = {0x000000000000000c 0x000000000000000d 0x000000000000000e}',
+                          '(4, 0, 0) = {0x0000000000000010 0x0000000000000011 0x0000000000000012}',
+                          '(5, 0, 0) = {0x0000000000000014 0x0000000000000015 0x0000000000000016}'])
+
+        # ulong4: each expected line is built from two adjacent string literals
+        self.try_command('language renderscript allocation dump 35',
+                         ['(0, 0, 0) = {0x0000000000000000 0x0000000000000001 '
+                                       '0x0000000000000002 0x0000000000000003}',
+                          '(1, 0, 0) = {0x0000000000000004 0x0000000000000005 '
+                                       '0x0000000000000006 0x0000000000000007}',
+                          '(2, 0, 0) = {0x0000000000000008 0x0000000000000009 '
+                                       '0x000000000000000a 0x000000000000000b}',
+                          '(3, 0, 0) = {0x000000000000000c 0x000000000000000d '
+                                       '0x000000000000000e 0x000000000000000f}',
+                          '(4, 0, 0) = {0x0000000000000010 0x0000000000000011 '
+                                       '0x0000000000000012 0x0000000000000013}',
+                          '(5, 0, 0) = {0x0000000000000014 0x0000000000000015 '
+                                       '0x0000000000000016 0x0000000000000017}'])
+
+        self.try_command('breakpoint del 2',
+                         ['1 breakpoints deleted'])
+
+        # Resume until the third kernel's breakpoint is hit
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+        # uint (28) again: element i should now read i*i, squared by square_kernel
+        self.try_command('language renderscript allocation dump 28',
+                         ['(0, 0, 0) = 0',
+                          '(1, 0, 0) = 1',
+                          '(2, 0, 0) = 4',
+                          '(3, 0, 0) = 9',
+                          '(4, 0, 0) = 16',
+                          '(5, 0, 0) = 25',
+                          '(6, 0, 0) = 36',
+                          '(7, 0, 0) = 49',
+                          '(8, 0, 0) = 64',
+                          '(9, 0, 0) = 81',
+                          '(10, 0, 0) = 100',
+                          '(11, 0, 0) = 121',
+                          '(12, 0, 0) = 144',
+                          '(13, 0, 0) = 169',
+                          '(14, 0, 0) = 196',
+                          '(15, 0, 0) = 225',
+                          '(16, 0, 0) = 256',
+                          '(17, 0, 0) = 289',
+                          '(18, 0, 0) = 324',
+                          '(19, 0, 0) = 361',
+                          '(20, 0, 0) = 400',
+                          '(21, 0, 0) = 441',
+                          '(22, 0, 0) = 484',
+                          '(23, 0, 0) = 529'])
+
+        # half: element i is 1/i at half precision (1/0 -> inf, 1/3 -> 0.333252)
+        self.try_command('language renderscript allocation dump 36',
+                         ['(0, 0, 0) = inf',
+                          '(1, 0, 0) = 1',
+                          '(2, 0, 0) = 0.5',
+                          '(3, 0, 0) = 0.333252',
+                          '(4, 0, 0) = 0.25',
+                          '(5, 0, 0) = 0.199951',
+                          '(6, 0, 0) = 0.166626',
+                          '(7, 0, 0) = 0.142822',
+                          '(8, 0, 0) = 0.125',
+                          '(9, 0, 0) = 0.111084',
+                          '(10, 0, 0) = 0.0999756',
+                          '(11, 0, 0) = 0.0908813',
+                          '(12, 0, 0) = 0.083313',
+                          '(13, 0, 0) = 0.0769043',
+                          '(14, 0, 0) = 0.0714111',
+                          '(15, 0, 0) = 0.0666504',
+                          '(16, 0, 0) = 0.0625',
+                          '(17, 0, 0) = 0.0588379',
+                          '(18, 0, 0) = 0.055542',
+                          '(19, 0, 0) = 0.0526428',
+                          '(20, 0, 0) = 0.0499878',
+                          '(21, 0, 0) = 0.0476074',
+                          '(22, 0, 0) = 0.0454407',
+                          '(23, 0, 0) = 0.0434875'])
+
+        # half2: the same 24 values, two per cell along x
+        self.try_command('language renderscript allocation dump 37',
+                         ['(0, 0, 0) = {inf 1}',
+                          '(1, 0, 0) = {0.5 0.333252}',
+                          '(2, 0, 0) = {0.25 0.199951}',
+                          '(3, 0, 0) = {0.166626 0.142822}',
+                          '(4, 0, 0) = {0.125 0.111084}',
+                          '(5, 0, 0) = {0.0999756 0.0908813}',
+                          '(6, 0, 0) = {0.083313 0.0769043}',
+                          '(7, 0, 0) = {0.0714111 0.0666504}',
+                          '(8, 0, 0) = {0.0625 0.0588379}',
+                          '(9, 0, 0) = {0.055542 0.0526428}',
+                          '(10, 0, 0) = {0.0499878 0.0476074}',
+                          '(11, 0, 0) = {0.0454407 0.0434875}'])
+
+        # half3: cells run along y; every 4th value is skipped
+        self.try_command('language renderscript allocation dump 38',
+                         ['(0, 0, 0) = {inf 1 0.5}',
+                          '(0, 1, 0) = {0.25 0.199951 0.166626}',
+                          '(0, 2, 0) = {0.125 0.111084 0.0999756}',
+                          '(0, 3, 0) = {0.083313 0.0769043 0.0714111}',
+                          '(0, 4, 0) = {0.0625 0.0588379 0.055542}',
+                          '(0, 5, 0) = {0.0499878 0.0476074 0.0454407}'])
+
+        # half4: four values per cell, 6 cells along x
+        self.try_command('language renderscript allocation dump 39',
+                         ['(0, 0, 0) = {inf 1 0.5 0.333252}',
+                          '(1, 0, 0) = {0.25 0.199951 0.166626 0.142822}',
+                          '(2, 0, 0) = {0.125 0.111084 0.0999756 0.0908813}',
+                          '(3, 0, 0) = {0.083313 0.0769043 0.0714111 0.0666504}',
+                          '(4, 0, 0) = {0.0625 0.0588379 0.055542 0.0526428}',
+                          '(5, 0, 0) = {0.0499878 0.0476074 0.0454407 0.0434875}'])
+
+        # float: element i is 1/i, printed to 6 significant digits
+        self.try_command('language renderscript allocation dump 40',
+                         ['(0, 0, 0) = inf',
+                          '(1, 0, 0) = 1',
+                          '(2, 0, 0) = 0.5',
+                          '(3, 0, 0) = 0.333333',
+                          '(4, 0, 0) = 0.25',
+                          '(5, 0, 0) = 0.2',
+                          '(6, 0, 0) = 0.166667',
+                          '(7, 0, 0) = 0.142857',
+                          '(8, 0, 0) = 0.125',
+                          '(9, 0, 0) = 0.111111',
+                          '(10, 0, 0) = 0.1',
+                          '(11, 0, 0) = 0.0909091',
+                          '(12, 0, 0) = 0.0833333',
+                          '(13, 0, 0) = 0.0769231',
+                          '(14, 0, 0) = 0.0714286',
+                          '(15, 0, 0) = 0.0666667',
+                          '(16, 0, 0) = 0.0625',
+                          '(17, 0, 0) = 0.0588235',
+                          '(18, 0, 0) = 0.0555556',
+                          '(19, 0, 0) = 0.0526316',
+                          '(20, 0, 0) = 0.05',
+                          '(21, 0, 0) = 0.047619',
+                          '(22, 0, 0) = 0.0454545',
+                          '(23, 0, 0) = 0.0434783'])
+
+        # float2: the same 24 values, two per cell along x
+        self.try_command('language renderscript allocation dump 41',
+                         ['(0, 0, 0) = {inf 1}',
+                          '(1, 0, 0) = {0.5 0.333333}',
+                          '(2, 0, 0) = {0.25 0.2}',
+                          '(3, 0, 0) = {0.166667 0.142857}',
+                          '(4, 0, 0) = {0.125 0.111111}',
+                          '(5, 0, 0) = {0.1 0.0909091}',
+                          '(6, 0, 0) = {0.0833333 0.0769231}',
+                          '(7, 0, 0) = {0.0714286 0.0666667}',
+                          '(8, 0, 0) = {0.0625 0.0588235}',
+                          '(9, 0, 0) = {0.0555556 0.0526316}',
+                          '(10, 0, 0) = {0.05 0.047619}',
+                          '(11, 0, 0) = {0.0454545 0.0434783}'])
+
+        # float3: 6 cells along x; every 4th value is skipped
+        self.try_command('language renderscript allocation dump 42',
+                         ['(0, 0, 0) = {inf 1 0.5}',
+                          '(1, 0, 0) = {0.25 0.2 0.166667}',
+                          '(2, 0, 0) = {0.125 0.111111 0.1}',
+                          '(3, 0, 0) = {0.0833333 0.0769231 0.0714286}',
+                          '(4, 0, 0) = {0.0625 0.0588235 0.0555556}',
+                          '(5, 0, 0) = {0.05 0.047619 0.0454545}'])
+
+        # float4: x in 0-2, y in 0-1
+        self.try_command('language renderscript allocation dump 43',
+                         ['(0, 0, 0) = {inf 1 0.5 0.333333}',
+                          '(1, 0, 0) = {0.25 0.2 0.166667 0.142857}',
+                          '(2, 0, 0) = {0.125 0.111111 0.1 0.0909091}',
+                          '(0, 1, 0) = {0.0833333 0.0769231 0.0714286 0.0666667}',
+                          '(1, 1, 0) = {0.0625 0.0588235 0.0555556 0.0526316}',
+                          '(2, 1, 0) = {0.05 0.047619 0.0454545 0.0434783}'])
+
+        # double: element i is 1/i, printed to 15 significant digits
+        self.try_command('language renderscript allocation dump 44',
+                         ['(0, 0, 0) = inf',
+                          '(1, 0, 0) = 1',
+                          '(2, 0, 0) = 0.5',
+                          '(3, 0, 0) = 0.333333333333333',
+                          '(4, 0, 0) = 0.25',
+                          '(5, 0, 0) = 0.2',
+                          '(6, 0, 0) = 0.166666666666667',
+                          '(7, 0, 0) = 0.142857142857143',
+                          '(8, 0, 0) = 0.125',
+                          '(9, 0, 0) = 0.111111111111111',
+                          '(10, 0, 0) = 0.1',
+                          '(11, 0, 0) = 0.0909090909090909',
+                          '(12, 0, 0) = 0.0833333333333333',
+                          '(13, 0, 0) = 0.0769230769230769',
+                          '(14, 0, 0) = 0.0714285714285714',
+                          '(15, 0, 0) = 0.0666666666666667',
+                          '(16, 0, 0) = 0.0625',
+                          '(17, 0, 0) = 0.0588235294117647',
+                          '(18, 0, 0) = 0.0555555555555556',
+                          '(19, 0, 0) = 0.0526315789473684',
+                          '(20, 0, 0) = 0.05',
+                          '(21, 0, 0) = 0.0476190476190476',
+                          '(22, 0, 0) = 0.0454545454545455',
+                          '(23, 0, 0) = 0.0434782608695652'])
+
+        # double2: x in 0-3, z in 0-2 (y stays 0)
+        self.try_command('language renderscript allocation dump 45',
+                         ['(0, 0, 0) = {inf 1}',
+                          '(1, 0, 0) = {0.5 0.333333333333333}',
+                          '(2, 0, 0) = {0.25 0.2}',
+                          '(3, 0, 0) = {0.166666666666667 0.142857142857143}',
+                          '(0, 0, 1) = {0.125 0.111111111111111}',
+                          '(1, 0, 1) = {0.1 0.0909090909090909}',
+                          '(2, 0, 1) = {0.0833333333333333 0.0769230769230769}',
+                          '(3, 0, 1) = {0.0714285714285714 0.0666666666666667}',
+                          '(0, 0, 2) = {0.0625 0.0588235294117647}',
+                          '(1, 0, 2) = {0.0555555555555556 0.0526315789473684}',
+                          '(2, 0, 2) = {0.05 0.0476190476190476}',
+                          '(3, 0, 2) = {0.0454545454545455 0.0434782608695652}'])
+
+        # double3: y in 0-1, z in 0-2 (x stays 0); every 4th value is skipped
+        self.try_command('language renderscript allocation dump 46',
+                         ['(0, 0, 0) = {inf 1 0.5}',
+                          '(0, 1, 0) = {0.25 0.2 0.166666666666667}',
+                          '(0, 0, 1) = {0.125 0.111111111111111 0.1}',
+                          '(0, 1, 1) = {0.0833333333333333 0.0769230769230769 '
+                                       '0.0714285714285714}',
+                          '(0, 0, 2) = {0.0625 0.0588235294117647 0.0555555555555556}',
+                          '(0, 1, 2) = {0.05 0.0476190476190476 0.0454545454545455}'])
+
+        # double4: y in 0-1, z in 0-2 (x stays 0)
+        self.try_command('language renderscript allocation dump 47',
+                         ['(0, 0, 0) = {inf 1 0.5 0.333333333333333}',
+                          '(0, 1, 0) = {0.25 0.2 0.166666666666667 0.142857142857143}',
+                          '(0, 0, 1) = {0.125 0.111111111111111 0.1 0.0909090909090909}',
+                          '(0, 1, 1) = {0.0833333333333333 0.0769230769230769 '
+                                       '0.0714285714285714 0.0666666666666667}',
+                          '(0, 0, 2) = {0.0625 0.0588235294117647 '
+                                       '0.0555555555555556 0.0526315789473684}',
+                          '(0, 1, 2) = {0.05 0.0476190476190476 '
+                                       '0.0454545454545455 0.0434782608695652}'])
+
+        # Delete kernel breakpoint on add_half_kernel, then run until the app exits
+        self.try_command('breakpoint del 3',
+                         ['1 breakpoints deleted'])
+
+        self.try_command('process continue',
+                         ['exited with status = 0'])

diff --git a/tests/lldb/tests/testcases/test_allocation_dump_2_jni.py b/tests/lldb/tests/testcases/test_allocation_dump_2_jni.py
new file mode 100644
index 0000000..41e33d3
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_allocation_dump_2_jni.py

@@ -0,0 +1,518 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestAllocationDump2JNI.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+
+
+class TestAllocationDump2JNI(TestBaseRemote):
+    '''Tests dumping unsigned and floating-point allocations of a JNI apk.'''
+
+    bundle_target = {
+        'jni': 'JNIAllocations'
+    }
+
+    def test_case(self):
+        '''Run the lldb commands that dump allocations 20 to 47 and check them.
+
+        Raises:
+            TestFail: One of the lldb commands did not provide the expected
+            output.
+        '''
+        # pylint: disable=line-too-long
+        self.try_command('language renderscript kernel breakpoint all enable',
+                         ['Breakpoints will be set on all kernels'])
+
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+        self.try_command('breakpoint del 1',
+                         ['1 breakpoints deleted'])
+
+        # Continue until the second kernel breakpoint is hit
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+        # uchar: 24 scalar elements along x, holding 0..23
+        self.try_command('language renderscript allocation dump 20',
+                         ['(0, 0, 0) = 0',
+                          '(1, 0, 0) = 1',
+                          '(2, 0, 0) = 2',
+                          '(3, 0, 0) = 3',
+                          '(4, 0, 0) = 4',
+                          '(5, 0, 0) = 5',
+                          '(6, 0, 0) = 6',
+                          '(7, 0, 0) = 7',
+                          '(8, 0, 0) = 8',
+                          '(9, 0, 0) = 9',
+                          '(10, 0, 0) = 10',
+                          '(11, 0, 0) = 11',
+                          '(12, 0, 0) = 12',
+                          '(13, 0, 0) = 13',
+                          '(14, 0, 0) = 14',
+                          '(15, 0, 0) = 15',
+                          '(16, 0, 0) = 16',
+                          '(17, 0, 0) = 17',
+                          '(18, 0, 0) = 18',
+                          '(19, 0, 0) = 19',
+                          '(20, 0, 0) = 20',
+                          '(21, 0, 0) = 21',
+                          '(22, 0, 0) = 22',
+                          '(23, 0, 0) = 23'])
+
+        # uchar2: 12 elements laid out 2 (x) by 6 (y)
+        self.try_command('language renderscript allocation dump 21',
+                         ['(0, 0, 0) = {0x00 0x01}',
+                          '(1, 0, 0) = {0x02 0x03}',
+                          '(0, 1, 0) = {0x04 0x05}',
+                          '(1, 1, 0) = {0x06 0x07}',
+                          '(0, 2, 0) = {0x08 0x09}',
+                          '(1, 2, 0) = {0x0a 0x0b}',
+                          '(0, 3, 0) = {0x0c 0x0d}',
+                          '(1, 3, 0) = {0x0e 0x0f}',
+                          '(0, 4, 0) = {0x10 0x11}',
+                          '(1, 4, 0) = {0x12 0x13}',
+                          '(0, 5, 0) = {0x14 0x15}',
+                          '(1, 5, 0) = {0x16 0x17}'])
+
+        # uchar3: 6 elements along x; every fourth value is absent (presumably vec3 padding - confirm)
+        self.try_command('language renderscript allocation dump 22',
+                         ['(0, 0, 0) = {0x00 0x01 0x02}',
+                          '(1, 0, 0) = {0x04 0x05 0x06}',
+                          '(2, 0, 0) = {0x08 0x09 0x0a}',
+                          '(3, 0, 0) = {0x0c 0x0d 0x0e}',
+                          '(4, 0, 0) = {0x10 0x11 0x12}',
+                          '(5, 0, 0) = {0x14 0x15 0x16}'])
+
+        # uchar4: 6 elements along x
+        self.try_command('language renderscript allocation dump 23',
+                         ['(0, 0, 0) = {0x00 0x01 0x02 0x03}',
+                          '(1, 0, 0) = {0x04 0x05 0x06 0x07}',
+                          '(2, 0, 0) = {0x08 0x09 0x0a 0x0b}',
+                          '(3, 0, 0) = {0x0c 0x0d 0x0e 0x0f}',
+                          '(4, 0, 0) = {0x10 0x11 0x12 0x13}',
+                          '(5, 0, 0) = {0x14 0x15 0x16 0x17}'])
+
+        # ushort: 24 scalar elements along x, holding 0..23
+        self.try_command('language renderscript allocation dump 24',
+                         ['(0, 0, 0) = 0',
+                          '(1, 0, 0) = 1',
+                          '(2, 0, 0) = 2',
+                          '(3, 0, 0) = 3',
+                          '(4, 0, 0) = 4',
+                          '(5, 0, 0) = 5',
+                          '(6, 0, 0) = 6',
+                          '(7, 0, 0) = 7',
+                          '(8, 0, 0) = 8',
+                          '(9, 0, 0) = 9',
+                          '(10, 0, 0) = 10',
+                          '(11, 0, 0) = 11',
+                          '(12, 0, 0) = 12',
+                          '(13, 0, 0) = 13',
+                          '(14, 0, 0) = 14',
+                          '(15, 0, 0) = 15',
+                          '(16, 0, 0) = 16',
+                          '(17, 0, 0) = 17',
+                          '(18, 0, 0) = 18',
+                          '(19, 0, 0) = 19',
+                          '(20, 0, 0) = 20',
+                          '(21, 0, 0) = 21',
+                          '(22, 0, 0) = 22',
+                          '(23, 0, 0) = 23'])
+
+        # ushort2: 12 elements along x
+        self.try_command('language renderscript allocation dump 25',
+                         ['(0, 0, 0) = {0x0000 0x0001}',
+                          '(1, 0, 0) = {0x0002 0x0003}',
+                          '(2, 0, 0) = {0x0004 0x0005}',
+                          '(3, 0, 0) = {0x0006 0x0007}',
+                          '(4, 0, 0) = {0x0008 0x0009}',
+                          '(5, 0, 0) = {0x000a 0x000b}',
+                          '(6, 0, 0) = {0x000c 0x000d}',
+                          '(7, 0, 0) = {0x000e 0x000f}',
+                          '(8, 0, 0) = {0x0010 0x0011}',
+                          '(9, 0, 0) = {0x0012 0x0013}',
+                          '(10, 0, 0) = {0x0014 0x0015}',
+                          '(11, 0, 0) = {0x0016 0x0017}'])
+
+        # ushort3: 6 elements along y; every fourth value is absent (presumably vec3 padding - confirm)
+        self.try_command('language renderscript allocation dump 26',
+                         ['(0, 0, 0) = {0x0000 0x0001 0x0002}',
+                          '(0, 1, 0) = {0x0004 0x0005 0x0006}',
+                          '(0, 2, 0) = {0x0008 0x0009 0x000a}',
+                          '(0, 3, 0) = {0x000c 0x000d 0x000e}',
+                          '(0, 4, 0) = {0x0010 0x0011 0x0012}',
+                          '(0, 5, 0) = {0x0014 0x0015 0x0016}'])
+
+        # ushort4: 6 elements along x
+        self.try_command('language renderscript allocation dump 27',
+                         ['(0, 0, 0) = {0x0000 0x0001 0x0002 0x0003}',
+                          '(1, 0, 0) = {0x0004 0x0005 0x0006 0x0007}',
+                          '(2, 0, 0) = {0x0008 0x0009 0x000a 0x000b}',
+                          '(3, 0, 0) = {0x000c 0x000d 0x000e 0x000f}',
+                          '(4, 0, 0) = {0x0010 0x0011 0x0012 0x0013}',
+                          '(5, 0, 0) = {0x0014 0x0015 0x0016 0x0017}'])
+
+        # uint: 24 scalar elements along x, holding 0..23 (dumped again after the third kernel)
+        self.try_command('language renderscript allocation dump 28',
+                         ['(0, 0, 0) = 0',
+                          '(1, 0, 0) = 1',
+                          '(2, 0, 0) = 2',
+                          '(3, 0, 0) = 3',
+                          '(4, 0, 0) = 4',
+                          '(5, 0, 0) = 5',
+                          '(6, 0, 0) = 6',
+                          '(7, 0, 0) = 7',
+                          '(8, 0, 0) = 8',
+                          '(9, 0, 0) = 9',
+                          '(10, 0, 0) = 10',
+                          '(11, 0, 0) = 11',
+                          '(12, 0, 0) = 12',
+                          '(13, 0, 0) = 13',
+                          '(14, 0, 0) = 14',
+                          '(15, 0, 0) = 15',
+                          '(16, 0, 0) = 16',
+                          '(17, 0, 0) = 17',
+                          '(18, 0, 0) = 18',
+                          '(19, 0, 0) = 19',
+                          '(20, 0, 0) = 20',
+                          '(21, 0, 0) = 21',
+                          '(22, 0, 0) = 22',
+                          '(23, 0, 0) = 23'])
+
+        # uint2: 12 elements along x
+        self.try_command('language renderscript allocation dump 29',
+                         ['(0, 0, 0) = {0x00000000 0x00000001}',
+                          '(1, 0, 0) = {0x00000002 0x00000003}',
+                          '(2, 0, 0) = {0x00000004 0x00000005}',
+                          '(3, 0, 0) = {0x00000006 0x00000007}',
+                          '(4, 0, 0) = {0x00000008 0x00000009}',
+                          '(5, 0, 0) = {0x0000000a 0x0000000b}',
+                          '(6, 0, 0) = {0x0000000c 0x0000000d}',
+                          '(7, 0, 0) = {0x0000000e 0x0000000f}',
+                          '(8, 0, 0) = {0x00000010 0x00000011}',
+                          '(9, 0, 0) = {0x00000012 0x00000013}',
+                          '(10, 0, 0) = {0x00000014 0x00000015}',
+                          '(11, 0, 0) = {0x00000016 0x00000017}'])
+
+        # uint3: 6 elements along x; every fourth value is absent (presumably vec3 padding - confirm)
+        self.try_command('language renderscript allocation dump 30',
+                         ['(0, 0, 0) = {0x00000000 0x00000001 0x00000002}',
+                          '(1, 0, 0) = {0x00000004 0x00000005 0x00000006}',
+                          '(2, 0, 0) = {0x00000008 0x00000009 0x0000000a}',
+                          '(3, 0, 0) = {0x0000000c 0x0000000d 0x0000000e}',
+                          '(4, 0, 0) = {0x00000010 0x00000011 0x00000012}',
+                          '(5, 0, 0) = {0x00000014 0x00000015 0x00000016}'])
+
+        # uint4: 6 elements along z
+        self.try_command('language renderscript allocation dump 31',
+                         ['(0, 0, 0) = {0x00000000 0x00000001 0x00000002 0x00000003}',
+                          '(0, 0, 1) = {0x00000004 0x00000005 0x00000006 0x00000007}',
+                          '(0, 0, 2) = {0x00000008 0x00000009 0x0000000a 0x0000000b}',
+                          '(0, 0, 3) = {0x0000000c 0x0000000d 0x0000000e 0x0000000f}',
+                          '(0, 0, 4) = {0x00000010 0x00000011 0x00000012 0x00000013}',
+                          '(0, 0, 5) = {0x00000014 0x00000015 0x00000016 0x00000017}'])
+
+        # ulong: 24 scalar elements laid out 4 (x) by 3 (y) by 2 (z)
+        self.try_command('language renderscript allocation dump 32',
+                         ['(0, 0, 0) = 0',
+                          '(1, 0, 0) = 1',
+                          '(2, 0, 0) = 2',
+                          '(3, 0, 0) = 3',
+                          '(0, 1, 0) = 4',
+                          '(1, 1, 0) = 5',
+                          '(2, 1, 0) = 6',
+                          '(3, 1, 0) = 7',
+                          '(0, 2, 0) = 8',
+                          '(1, 2, 0) = 9',
+                          '(2, 2, 0) = 10',
+                          '(3, 2, 0) = 11',
+                          '(0, 0, 1) = 12',
+                          '(1, 0, 1) = 13',
+                          '(2, 0, 1) = 14',
+                          '(3, 0, 1) = 15',
+                          '(0, 1, 1) = 16',
+                          '(1, 1, 1) = 17',
+                          '(2, 1, 1) = 18',
+                          '(3, 1, 1) = 19',
+                          '(0, 2, 1) = 20',
+                          '(1, 2, 1) = 21',
+                          '(2, 2, 1) = 22',
+                          '(3, 2, 1) = 23'])
+
+        # ulong2: 12 elements along x
+        self.try_command('language renderscript allocation dump 33',
+                         ['(0, 0, 0) = {0x0000000000000000 0x0000000000000001}',
+                          '(1, 0, 0) = {0x0000000000000002 0x0000000000000003}',
+                          '(2, 0, 0) = {0x0000000000000004 0x0000000000000005}',
+                          '(3, 0, 0) = {0x0000000000000006 0x0000000000000007}',
+                          '(4, 0, 0) = {0x0000000000000008 0x0000000000000009}',
+                          '(5, 0, 0) = {0x000000000000000a 0x000000000000000b}',
+                          '(6, 0, 0) = {0x000000000000000c 0x000000000000000d}',
+                          '(7, 0, 0) = {0x000000000000000e 0x000000000000000f}',
+                          '(8, 0, 0) = {0x0000000000000010 0x0000000000000011}',
+                          '(9, 0, 0) = {0x0000000000000012 0x0000000000000013}',
+                          '(10, 0, 0) = {0x0000000000000014 0x0000000000000015}',
+                          '(11, 0, 0) = {0x0000000000000016 0x0000000000000017}'])
+
+        # ulong3: 6 elements along x; every fourth value is absent (presumably vec3 padding - confirm)
+        self.try_command('language renderscript allocation dump 34',
+                         ['(0, 0, 0) = {0x0000000000000000 0x0000000000000001 0x0000000000000002}',
+                          '(1, 0, 0) = {0x0000000000000004 0x0000000000000005 0x0000000000000006}',
+                          '(2, 0, 0) = {0x0000000000000008 0x0000000000000009 0x000000000000000a}',
+                          '(3, 0, 0) = {0x000000000000000c 0x000000000000000d 0x000000000000000e}',
+                          '(4, 0, 0) = {0x0000000000000010 0x0000000000000011 0x0000000000000012}',
+                          '(5, 0, 0) = {0x0000000000000014 0x0000000000000015 0x0000000000000016}'])
+
+        # ulong4: 6 elements along x
+        self.try_command('language renderscript allocation dump 35',
+                         ['(0, 0, 0) = {0x0000000000000000 0x0000000000000001 '
+                                       '0x0000000000000002 0x0000000000000003}',
+                          '(1, 0, 0) = {0x0000000000000004 0x0000000000000005 '
+                                       '0x0000000000000006 0x0000000000000007}',
+                          '(2, 0, 0) = {0x0000000000000008 0x0000000000000009 '
+                                       '0x000000000000000a 0x000000000000000b}',
+                          '(3, 0, 0) = {0x000000000000000c 0x000000000000000d '
+                                       '0x000000000000000e 0x000000000000000f}',
+                          '(4, 0, 0) = {0x0000000000000010 0x0000000000000011 '
+                                       '0x0000000000000012 0x0000000000000013}',
+                          '(5, 0, 0) = {0x0000000000000014 0x0000000000000015 '
+                                       '0x0000000000000016 0x0000000000000017}'])
+
+        self.try_command('breakpoint del 2',
+                         ['1 breakpoints deleted'])
+
+        # Continue until the third kernel breakpoint is hit
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+        # Test that uint allocation 28 (0..23 above) has been squared by square_kernel
+        self.try_command('language renderscript allocation dump 28',
+                         ['(0, 0, 0) = 0',
+                          '(1, 0, 0) = 1',
+                          '(2, 0, 0) = 4',
+                          '(3, 0, 0) = 9',
+                          '(4, 0, 0) = 16',
+                          '(5, 0, 0) = 25',
+                          '(6, 0, 0) = 36',
+                          '(7, 0, 0) = 49',
+                          '(8, 0, 0) = 64',
+                          '(9, 0, 0) = 81',
+                          '(10, 0, 0) = 100',
+                          '(11, 0, 0) = 121',
+                          '(12, 0, 0) = 144',
+                          '(13, 0, 0) = 169',
+                          '(14, 0, 0) = 196',
+                          '(15, 0, 0) = 225',
+                          '(16, 0, 0) = 256',
+                          '(17, 0, 0) = 289',
+                          '(18, 0, 0) = 324',
+                          '(19, 0, 0) = 361',
+                          '(20, 0, 0) = 400',
+                          '(21, 0, 0) = 441',
+                          '(22, 0, 0) = 484',
+                          '(23, 0, 0) = 529'])
+
+        # half: 24 scalar elements along x; element i reads as 1/i at half precision (inf for i = 0)
+        self.try_command('language renderscript allocation dump 36',
+                         ['(0, 0, 0) = inf',
+                          '(1, 0, 0) = 1',
+                          '(2, 0, 0) = 0.5',
+                          '(3, 0, 0) = 0.333252',
+                          '(4, 0, 0) = 0.25',
+                          '(5, 0, 0) = 0.199951',
+                          '(6, 0, 0) = 0.166626',
+                          '(7, 0, 0) = 0.142822',
+                          '(8, 0, 0) = 0.125',
+                          '(9, 0, 0) = 0.111084',
+                          '(10, 0, 0) = 0.0999756',
+                          '(11, 0, 0) = 0.0908813',
+                          '(12, 0, 0) = 0.083313',
+                          '(13, 0, 0) = 0.0769043',
+                          '(14, 0, 0) = 0.0714111',
+                          '(15, 0, 0) = 0.0666504',
+                          '(16, 0, 0) = 0.0625',
+                          '(17, 0, 0) = 0.0588379',
+                          '(18, 0, 0) = 0.055542',
+                          '(19, 0, 0) = 0.0526428',
+                          '(20, 0, 0) = 0.0499878',
+                          '(21, 0, 0) = 0.0476074',
+                          '(22, 0, 0) = 0.0454407',
+                          '(23, 0, 0) = 0.0434875'])
+
+        # half2: 12 elements along x
+        self.try_command('language renderscript allocation dump 37',
+                         ['(0, 0, 0) = {inf 1}',
+                          '(1, 0, 0) = {0.5 0.333252}',
+                          '(2, 0, 0) = {0.25 0.199951}',
+                          '(3, 0, 0) = {0.166626 0.142822}',
+                          '(4, 0, 0) = {0.125 0.111084}',
+                          '(5, 0, 0) = {0.0999756 0.0908813}',
+                          '(6, 0, 0) = {0.083313 0.0769043}',
+                          '(7, 0, 0) = {0.0714111 0.0666504}',
+                          '(8, 0, 0) = {0.0625 0.0588379}',
+                          '(9, 0, 0) = {0.055542 0.0526428}',
+                          '(10, 0, 0) = {0.0499878 0.0476074}',
+                          '(11, 0, 0) = {0.0454407 0.0434875}'])
+
+        # half3: 6 elements along y; every fourth value is absent (presumably vec3 padding - confirm)
+        self.try_command('language renderscript allocation dump 38',
+                         ['(0, 0, 0) = {inf 1 0.5}',
+                          '(0, 1, 0) = {0.25 0.199951 0.166626}',
+                          '(0, 2, 0) = {0.125 0.111084 0.0999756}',
+                          '(0, 3, 0) = {0.083313 0.0769043 0.0714111}',
+                          '(0, 4, 0) = {0.0625 0.0588379 0.055542}',
+                          '(0, 5, 0) = {0.0499878 0.0476074 0.0454407}'])
+
+        # half4: 6 elements along x
+        self.try_command('language renderscript allocation dump 39',
+                         ['(0, 0, 0) = {inf 1 0.5 0.333252}',
+                          '(1, 0, 0) = {0.25 0.199951 0.166626 0.142822}',
+                          '(2, 0, 0) = {0.125 0.111084 0.0999756 0.0908813}',
+                          '(3, 0, 0) = {0.083313 0.0769043 0.0714111 0.0666504}',
+                          '(4, 0, 0) = {0.0625 0.0588379 0.055542 0.0526428}',
+                          '(5, 0, 0) = {0.0499878 0.0476074 0.0454407 0.0434875}'])
+
+        # float: 24 scalar elements along x; element i reads as 1/i (inf for i = 0)
+        self.try_command('language renderscript allocation dump 40',
+                         ['(0, 0, 0) = inf',
+                          '(1, 0, 0) = 1',
+                          '(2, 0, 0) = 0.5',
+                          '(3, 0, 0) = 0.333333',
+                          '(4, 0, 0) = 0.25',
+                          '(5, 0, 0) = 0.2',
+                          '(6, 0, 0) = 0.166667',
+                          '(7, 0, 0) = 0.142857',
+                          '(8, 0, 0) = 0.125',
+                          '(9, 0, 0) = 0.111111',
+                          '(10, 0, 0) = 0.1',
+                          '(11, 0, 0) = 0.0909091',
+                          '(12, 0, 0) = 0.0833333',
+                          '(13, 0, 0) = 0.0769231',
+                          '(14, 0, 0) = 0.0714286',
+                          '(15, 0, 0) = 0.0666667',
+                          '(16, 0, 0) = 0.0625',
+                          '(17, 0, 0) = 0.0588235',
+                          '(18, 0, 0) = 0.0555556',
+                          '(19, 0, 0) = 0.0526316',
+                          '(20, 0, 0) = 0.05',
+                          '(21, 0, 0) = 0.047619',
+                          '(22, 0, 0) = 0.0454545',
+                          '(23, 0, 0) = 0.0434783'])
+
+        # float2: 12 elements along x
+        self.try_command('language renderscript allocation dump 41',
+                         ['(0, 0, 0) = {inf 1}',
+                          '(1, 0, 0) = {0.5 0.333333}',
+                          '(2, 0, 0) = {0.25 0.2}',
+                          '(3, 0, 0) = {0.166667 0.142857}',
+                          '(4, 0, 0) = {0.125 0.111111}',
+                          '(5, 0, 0) = {0.1 0.0909091}',
+                          '(6, 0, 0) = {0.0833333 0.0769231}',
+                          '(7, 0, 0) = {0.0714286 0.0666667}',
+                          '(8, 0, 0) = {0.0625 0.0588235}',
+                          '(9, 0, 0) = {0.0555556 0.0526316}',
+                          '(10, 0, 0) = {0.05 0.047619}',
+                          '(11, 0, 0) = {0.0454545 0.0434783}'])
+
+        # float3: 6 elements along x; every fourth value is absent (presumably vec3 padding - confirm)
+        self.try_command('language renderscript allocation dump 42',
+                         ['(0, 0, 0) = {inf 1 0.5}',
+                          '(1, 0, 0) = {0.25 0.2 0.166667}',
+                          '(2, 0, 0) = {0.125 0.111111 0.1}',
+                          '(3, 0, 0) = {0.0833333 0.0769231 0.0714286}',
+                          '(4, 0, 0) = {0.0625 0.0588235 0.0555556}',
+                          '(5, 0, 0) = {0.05 0.047619 0.0454545}'])
+
+        # float4: 6 elements laid out 3 (x) by 2 (y)
+        self.try_command('language renderscript allocation dump 43',
+                         ['(0, 0, 0) = {inf 1 0.5 0.333333}',
+                          '(1, 0, 0) = {0.25 0.2 0.166667 0.142857}',
+                          '(2, 0, 0) = {0.125 0.111111 0.1 0.0909091}',
+                          '(0, 1, 0) = {0.0833333 0.0769231 0.0714286 0.0666667}',
+                          '(1, 1, 0) = {0.0625 0.0588235 0.0555556 0.0526316}',
+                          '(2, 1, 0) = {0.05 0.047619 0.0454545 0.0434783}'])
+
+        # double: 24 scalar elements along x; element i reads as 1/i (inf for i = 0)
+        self.try_command('language renderscript allocation dump 44',
+                         ['(0, 0, 0) = inf',
+                          '(1, 0, 0) = 1',
+                          '(2, 0, 0) = 0.5',
+                          '(3, 0, 0) = 0.333333333333333',
+                          '(4, 0, 0) = 0.25',
+                          '(5, 0, 0) = 0.2',
+                          '(6, 0, 0) = 0.166666666666667',
+                          '(7, 0, 0) = 0.142857142857143',
+                          '(8, 0, 0) = 0.125',
+                          '(9, 0, 0) = 0.111111111111111',
+                          '(10, 0, 0) = 0.1',
+                          '(11, 0, 0) = 0.0909090909090909',
+                          '(12, 0, 0) = 0.0833333333333333',
+                          '(13, 0, 0) = 0.0769230769230769',
+                          '(14, 0, 0) = 0.0714285714285714',
+                          '(15, 0, 0) = 0.0666666666666667',
+                          '(16, 0, 0) = 0.0625',
+                          '(17, 0, 0) = 0.0588235294117647',
+                          '(18, 0, 0) = 0.0555555555555556',
+                          '(19, 0, 0) = 0.0526315789473684',
+                          '(20, 0, 0) = 0.05',
+                          '(21, 0, 0) = 0.0476190476190476',
+                          '(22, 0, 0) = 0.0454545454545455',
+                          '(23, 0, 0) = 0.0434782608695652'])
+
+        # double2: 12 elements laid out 4 (x) by 3 (z)
+        self.try_command('language renderscript allocation dump 45',
+                         ['(0, 0, 0) = {inf 1}',
+                          '(1, 0, 0) = {0.5 0.333333333333333}',
+                          '(2, 0, 0) = {0.25 0.2}',
+                          '(3, 0, 0) = {0.166666666666667 0.142857142857143}',
+                          '(0, 0, 1) = {0.125 0.111111111111111}',
+                          '(1, 0, 1) = {0.1 0.0909090909090909}',
+                          '(2, 0, 1) = {0.0833333333333333 0.0769230769230769}',
+                          '(3, 0, 1) = {0.0714285714285714 0.0666666666666667}',
+                          '(0, 0, 2) = {0.0625 0.0588235294117647}',
+                          '(1, 0, 2) = {0.0555555555555556 0.0526315789473684}',
+                          '(2, 0, 2) = {0.05 0.0476190476190476}',
+                          '(3, 0, 2) = {0.0454545454545455 0.0434782608695652}'])
+
+        # double3: 6 elements laid out 2 (y) by 3 (z); every fourth value is absent (presumably vec3 padding - confirm)
+        self.try_command('language renderscript allocation dump 46',
+                         ['(0, 0, 0) = {inf 1 0.5}',
+                          '(0, 1, 0) = {0.25 0.2 0.166666666666667}',
+                          '(0, 0, 1) = {0.125 0.111111111111111 0.1}',
+                          '(0, 1, 1) = {0.0833333333333333 0.0769230769230769 '
+                                       '0.0714285714285714}',
+                          '(0, 0, 2) = {0.0625 0.0588235294117647 0.0555555555555556}',
+                          '(0, 1, 2) = {0.05 0.0476190476190476 0.0454545454545455}'])
+
+        # double4: 6 elements laid out 2 (y) by 3 (z)
+        self.try_command('language renderscript allocation dump 47',
+                         ['(0, 0, 0) = {inf 1 0.5 0.333333333333333}',
+                          '(0, 1, 0) = {0.25 0.2 0.166666666666667 0.142857142857143}',
+                          '(0, 0, 1) = {0.125 0.111111111111111 0.1 0.0909090909090909}',
+                          '(0, 1, 1) = {0.0833333333333333 0.0769230769230769 '
+                                       '0.0714285714285714 0.0666666666666667}',
+                          '(0, 0, 2) = {0.0625 0.0588235294117647 '
+                                       '0.0555555555555556 0.0526315789473684}',
+                          '(0, 1, 2) = {0.05 0.0476190476190476 '
+                                       '0.0454545454545455 0.0434782608695652}'])

diff --git a/tests/lldb/tests/testcases/test_allocation_dump_struct.py b/tests/lldb/tests/testcases/test_allocation_dump_struct.py
new file mode 100644
index 0000000..f161131
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_allocation_dump_struct.py

@@ -0,0 +1,165 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestAllocationDumpStruct.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+
+
+class TestAllocationDumpStruct(TestBaseRemote):
+    '''Tests dumping an allocation whose elements are complexStruct values.'''
+
+    bundle_target = {
+        'java': 'Allocations'
+    }
+
+    def setup(self, android):
+        '''This test must run on one thread, so push debug.rs.max-threads = 1.'''
+        android.push_prop('debug.rs.max-threads', 1)
+
+    def teardown(self, android):
+        '''Pop debug.rs.max-threads to reset the RS thread count to its previous value.'''
+        android.pop_prop('debug.rs.max-threads')
+
+    def test_dump_complex_struct_allocation(self):
+        # Hit struct_kernel on last coordinate, so almost all elements have been initialised
+        self.try_command(
+            'language renderscript kernel breakpoint set struct_kernel -c 23',
+            ['Conditional kernel breakpoint on coordinate (23, 0, 0)',
+            'Breakpoint(s) created'])
+
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+        # complex_struct output allocation; only elements 0..22 are checked (stopped at coordinate 23)
+        self.try_command('language renderscript allocation dump 49',
+                         ['(0, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 0, j = 0)\n'
+                          '   (0x00, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 0, [1] = 0.5)\n'
+                          '}',
+                          '(1, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 1, j = 1)\n'
+                          '   (0x01, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 1, [1] = 1.5)\n'
+                          '}',
+                          '(2, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 2, j = 2)\n'
+                          '   (0x02, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 2, [1] = 2.5)\n'
+                          '}',
+                          '(3, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 3, j = 3)\n'
+                          '   (0x03, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 3, [1] = 3.5)\n'
+                          '}',
+                          '(4, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 4, j = 4)\n'
+                          '   (0x04, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 4, [1] = 4.5)\n'
+                          '}',
+                          '(5, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 5, j = 5)\n'
+                          '   (0x05, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 5, [1] = 5.5)\n'
+                          '}',
+                          '(6, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 6, j = 6)\n'
+                          '   (0x06, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 6, [1] = 6.5)\n'
+                          '}',
+                          '(7, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 7, j = 7)\n'
+                          '   (0x07, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 7, [1] = 7.5)\n'
+                          '}',
+                          '(8, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 8, j = 8)\n'
+                          '   (0x08, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 8, [1] = 8.5)\n'
+                          '}',
+                          '(9, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 9, j = 9)\n'
+                          '   (0x09, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 9, [1] = 9.5)\n'
+                          '}',
+                          '(10, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 10, j = 10)\n'
+                          '   (0x0a, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 10, [1] = 10.5)\n'
+                          '}',
+                          '(11, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 11, j = 11)\n'
+                          '   (0x0b, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 11, [1] = 11.5)\n'
+                          '}',
+                          '(12, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 12, j = 12)\n'
+                          '   (0x0c, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 12, [1] = 12.5)\n'
+                          '}',
+                          '(13, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 13, j = 13)\n'
+                          '   (0x0d, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 13, [1] = 13.5)\n'
+                          '}',
+                          '(14, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 14, j = 14)\n'
+                          '   (0x0e, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 14, [1] = 14.5)\n'
+                          '}',
+                          '(15, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 15, j = 15)\n'
+                          '   (0x0f, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 15, [1] = 15.5)\n'
+                          '}',
+                          '(16, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 16, j = 16)\n'
+                          '   (0x10, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 16, [1] = 16.5)\n'
+                          '}',
+                          '(17, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 17, j = 17)\n'
+                          '   (0x11, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 17, [1] = 17.5)\n'
+                          '}',
+                          '(18, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 18, j = 18)\n'
+                          '   (0x12, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 18, [1] = 18.5)\n'
+                          '}',
+                          '(19, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 19, j = 19)\n'
+                          '   (0x13, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 19, [1] = 19.5)\n'
+                          '}',
+                          '(20, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 20, j = 20)\n'
+                          '   (0x14, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 20, [1] = 20.5)\n'
+                          '}',
+                          '(21, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 21, j = 21)\n'
+                          '   (0x15, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 21, [1] = 21.5)\n'
+                          '}',
+                          '(22, 0, 0) = (complexStruct)  {\n'
+                          '   (i = 22, j = 22)\n'
+                          '   (0x16, 0x41, 0x42, 0x43)\n'
+                          '   ([0] = 22, [1] = 22.5)\n'
+                          '}'])

diff --git a/tests/lldb/tests/testcases/test_allocation_file.py b/tests/lldb/tests/testcases/test_allocation_file.py
new file mode 100644
index 0000000..680ae4a
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_allocation_file.py

@@ -0,0 +1,210 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestAllocationFile.'''
+
+from __future__ import absolute_import
+
+import os
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    cpp_only_test,
+    ordered_test
+)
+
+
+class TestAllocationFile(TestBaseRemote):
+    '''Checks that allocation contents survive a save/load round trip.
+
+    Allocations are written to temporary files with 'allocation save', read
+    back with 'allocation load' and verified with 'allocation dump'.
+    '''
+
+    bundle_target = {
+        'java': 'Allocations',
+        'cpp': 'CppAllocations',
+        'jni': 'JNIAllocations'
+    }
+
+    @ordered_test(0)
+    def test_allocation_file_roundtrip(self):
+        '''Save, reload and dump allocations while stepping through kernels.'''
+        # Output expected from each 'process continue' that stops at a kernel
+        # breakpoint; a fresh list is built from it for every call.
+        stopped_at_kernel = ('resuming', 'stopped', 'stop reason = breakpoint')
+
+        self.try_command('language renderscript kernel breakpoint all enable',
+                         ['Breakpoints will be set on all kernels'])
+        self.try_command('process continue', list(stopped_at_kernel))
+
+        # Binary file of int2 allocation
+        int2_path = self.get_tmp_file_path()
+        save_int2 = 'language renderscript allocation save 12 ' + int2_path
+        self.try_command(save_int2,
+                         ["Allocation written to file '%s'" % int2_path])
+
+        # Check file was created
+        self.assert_true(os.path.isfile(int2_path))
+
+        # Load the file we just created, to assert the allocation contents are
+        # the same
+        load_int2 = 'language renderscript allocation load 12 ' + int2_path
+        self.try_command(load_int2,
+                         ["Contents of file '%s' read into allocation 12" %
+                          int2_path])
+        os.remove(int2_path)
+
+        # Cell x of the int2 allocation is expected to hold {2x 2x+1}.
+        int2_dump = ['(%d, 0, 0) = {%d %d}' % (x, 2 * x, 2 * x + 1)
+                     for x in range(12)]
+        self.try_command('language renderscript allocation dump 12', int2_dump)
+
+        self.try_command('breakpoint del 1', ['1 breakpoints deleted'])
+
+        # Hit second kernel
+        self.try_command('process continue', list(stopped_at_kernel))
+
+        # Binary file of uint allocation
+        uint_path = self.get_tmp_file_path()
+        save_uint = 'language renderscript allocation save 28 ' + uint_path
+        self.try_command(save_uint,
+                         ["Allocation written to file '%s'" % uint_path])
+
+        # Check file was created
+        self.assert_true(os.path.isfile(uint_path))
+
+        # Test loading file into allocation with an incompatible type 'short'
+        mismatch_output = [
+            "Contents of file '%s' read into allocation 7" % uint_path,
+            "Warning: Mismatched Element sizes",
+            "Warning: Mismatched Types",
+            "Warning: Mismatched allocation sizes",
+        ]
+        self.try_command('language renderscript allocation load 7 ' + uint_path,
+                         mismatch_output)
+
+        # Check result of size inconsistency, mapping 4-byte unsigned to 2-byte
+        # int: even cells are expected to show x / 2 and odd cells 0.
+        short_dump = ['(%d, 0, 0) = %d' % (x, 0 if x % 2 else x // 2)
+                      for x in range(24)]
+        self.try_command('language renderscript allocation dump 7', short_dump)
+
+        self.try_command('breakpoint del 2', ['1 breakpoints deleted'])
+
+        # Hit third kernel
+        self.try_command('process continue', list(stopped_at_kernel))
+
+        # Test that uint allocation has been squared by square_kernel
+        squared_dump = ['(%d, 0, 0) = %d' % (x, x * x) for x in range(24)]
+        self.try_command('language renderscript allocation dump 28',
+                         squared_dump)
+
+        # Load uint allocation from save before square_kernel had been run
+        load_uint = 'language renderscript allocation load 28 ' + uint_path
+        self.try_command(load_uint,
+                         ["Contents of file '%s' read into allocation 28" %
+                          uint_path])
+        os.remove(uint_path)
+
+        # Check contents are back to original
+        original_dump = ['(%d, 0, 0) = %d' % (x, x) for x in range(24)]
+        self.try_command('language renderscript allocation dump 28',
+                         original_dump)
+
+    @ordered_test('last')
+    @cpp_only_test()
+    def test_cpp_cleanup(self):
+        '''Delete breakpoint 3 and expect the process to exit with status 0.'''
+        self.try_command('breakpoint delete 3', ['1 breakpoints deleted'])
+        self.try_command('process continue', ['exited with status = 0'])

diff --git a/tests/lldb/tests/testcases/test_allocation_list.py b/tests/lldb/tests/testcases/test_allocation_list.py
new file mode 100644
index 0000000..ca492f2
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_allocation_list.py

@@ -0,0 +1,547 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestAllocationList.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    ordered_test,
+    wimpy,
+    cpp_only_test,
+    java_only_test,
+)
+
+
+class TestAllocationList(TestBaseRemote):
+    '''Tests printing the details of all allocations.'''
+
+    bundle_target = {
+        'java': 'Allocations',
+        'jni': 'JNIAllocations',
+        'cpp': 'CppAllocations'
+    }
+
+    @wimpy
+    @ordered_test(0)
+    def test_allocation_list_single(self):
+        '''Stop at the first kernel and list allocation 3 on its own.'''
+        self.try_command('language renderscript kernel breakpoint all enable',
+                         ['Breakpoints will be set on all kernels'])
+
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+        # Test command line flag for single allocation.
+        # The parentheses around the dimensions are written as '\\(' and
+        # '\\)': the resulting pattern text is unchanged, but the literal no
+        # longer contains an invalid escape sequence, so no pylint
+        # suppression is required.
+        self.try_command('language renderscript allocation list -i 3',
+                         [],
+                         ['3:\n'
+                          # Hex fields must be a non zero hex number
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \\(1, 3, 8\\)\n'
+                          '    Data Type: char\n'
+                          '    Data Kind: User'])
+
+    @ordered_test(1)
+    def test_allocation_list_all(self):
+        '''List all allocations and check the entries for ids 1 to 19.'''
+        # Pattern for one listed allocation, filled with
+        # (id, dimensions, data type, data kind).  The Context, Address and
+        # Data pointer fields use a regex for a non zero hex number.  The
+        # parentheses are escaped as '\\(' / '\\)' so the pattern text stays
+        # the same without an invalid escape sequence in the string literal.
+        entry_re = ('%d:\n'
+                    '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                    '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                    '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                    '    Dimensions: \\(%s\\)\n'
+                    '    Data Type: %s\n'
+                    '    Data Kind: %s')
+
+        listed = [
+            (1, '64, 64, 0', 'uchar4', 'RGBA Pixel'),
+            (2, '64, 64, 0', 'uchar4', 'RGBA Pixel'),
+            (3, '1, 3, 8', 'char', 'User'),
+            (4, '12, 0, 0', 'char2', 'User'),
+            (5, '6, 0, 0', 'char3', 'User'),
+            (6, '6, 0, 0', 'char4', 'User'),
+            (7, '24, 0, 0', 'short', 'User'),
+            (8, '6, 1, 2', 'short2', 'User'),
+            (9, '6, 0, 0', 'short3', 'User'),
+            (10, '6, 0, 0', 'short4', 'User'),
+            (11, '24, 0, 0', 'int', 'User'),
+            (12, '12, 0, 0', 'int2', 'User'),
+            (13, '3, 2, 0', 'int3', 'User'),
+            (14, '6, 0, 0', 'int4', 'User'),
+            (15, '24, 0, 0', 'long', 'User'),
+            (16, '12, 0, 0', 'long2', 'User'),
+            (17, '6, 0, 0', 'long3', 'User'),
+            (18, '1, 6, 0', 'long4', 'User'),
+            (19, '24, 0, 0', 'bool', 'User'),
+        ]
+
+        self.try_command('language renderscript allocation list',
+                         [],
+                         [entry_re % row for row in listed])
+
+    @wimpy
+    @ordered_test(2)
+    def test_continue_1(self):
+        '''Delete breakpoint 1 and resume until the next breakpoint stop.'''
+        deleted_output = ['1 breakpoints deleted']
+        self.try_command('breakpoint del 1', deleted_output)
+
+        # Hit second kernel
+        stop_output = ['resuming', 'stopped', 'stop reason = breakpoint']
+        self.try_command('process continue', stop_output)
+
+    @ordered_test(3)
+    def test_allocation_list_all2_java(self):
+        '''List all allocations again (ordered after test_continue_1).
+
+        Allocation 1 is only expected when the app type is 'java'.
+        '''
+        # Pattern for one listed allocation, filled with
+        # (id, dimensions, data type, data kind).  The Context, Address and
+        # Data pointer fields use a regex for a non zero hex number.  The
+        # parentheses are escaped as '\\(' / '\\)' so the pattern text stays
+        # the same without an invalid escape sequence in the string literal.
+        entry_re = ('%d:\n'
+                    '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                    '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                    '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                    '    Dimensions: \\(%s\\)\n'
+                    '    Data Type: %s\n'
+                    '    Data Kind: %s')
+
+        # TODO investigate why java tests show extra allocations
+        if self.app_type == 'java':
+            listed = [(1, '64, 64, 0', 'uchar4', 'RGBA Pixel')]
+        else:
+            listed = []
+
+        listed += [
+            (2, '64, 64, 0', 'uchar4', 'RGBA Pixel'),
+            (7, '24, 0, 0', 'short', 'User'),
+            (20, '24, 0, 0', 'uchar', 'User'),
+            (21, '2, 6, 0', 'uchar2', 'User'),
+            (22, '6, 0, 0', 'uchar3', 'User'),
+            (23, '6, 0, 0', 'uchar4', 'User'),
+            (24, '24, 0, 0', 'ushort', 'User'),
+            (25, '12, 0, 0', 'ushort2', 'User'),
+            (26, '1, 6, 0', 'ushort3', 'User'),
+            (27, '6, 0, 0', 'ushort4', 'User'),
+            (28, '24, 0, 0', 'uint', 'User'),
+            (29, '12, 0, 0', 'uint2', 'User'),
+            (30, '6, 0, 0', 'uint3', 'User'),
+            (31, '1, 1, 6', 'uint4', 'User'),
+            (32, '4, 3, 2', 'ulong', 'User'),
+            (33, '12, 0, 0', 'ulong2', 'User'),
+            (34, '6, 0, 0', 'ulong3', 'User'),
+            (35, '6, 0, 0', 'ulong4', 'User'),
+        ]
+
+        self.try_command('language renderscript allocation list',
+                         [],
+                         [entry_re % row for row in listed])
+
+    @wimpy
+    @ordered_test(4)
+    def test_continue_2(self):
+        '''Delete breakpoint 2 and resume until the next breakpoint stop.'''
+        deleted_output = ['1 breakpoints deleted']
+        self.try_command('breakpoint del 2', deleted_output)
+
+        # Hit third kernel
+        stop_output = ['resuming', 'stopped', 'stop reason = breakpoint']
+        self.try_command('process continue', stop_output)
+
+    @ordered_test(5)
+    def test_allocation_list_all3(self):
+        self.try_command('language renderscript allocation list',
+                         [],
+                         ['2:\n'
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(64, 64, 0\)\n'
+                          '    Data Type: uchar4\n'
+                          '    Data Kind: RGBA Pixel',
+                          '7:\n'
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(24, 0, 0\)\n'
+                          '    Data Type: short\n'
+                          '    Data Kind: User',
+                          '28:\n'
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(24, 0, 0\)\n'
+                          '    Data Type: uint\n'
+                          '    Data Kind: User',
+                          '36:\n'
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(24, 0, 0\)\n'
+                          '    Data Type: half\n'
+                          '    Data Kind: User',
+                          '37:\n'
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(12, 0, 0\)\n'
+                          '    Data Type: half2\n'
+                          '    Data Kind: User',
+                          '38:\n'
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(1, 6, 0\)\n'
+                          '    Data Type: half3\n'
+                          '    Data Kind: User',
+                          '39:\n'
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(6, 0, 0\)\n'
+                          '    Data Type: half4\n'
+                          '    Data Kind: User',
+                          '40:\n'
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(24, 0, 0\)\n'
+                          '    Data Type: float\n'
+                          '    Data Kind: User',
+                          '41:\n'
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(12, 0, 0\)\n'
+                          '    Data Type: float2\n'
+                          '    Data Kind: User',
+                          '42:\n'
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(6, 0, 0\)\n'
+                          '    Data Type: float3\n'
+                          '    Data Kind: User',
+                          '43:\n'
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(3, 2, 0\)\n'
+                          '    Data Type: float4\n'
+                          '    Data Kind: User',
+                          '44:\n'
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(24, 0, 0\)\n'
+                          '    Data Type: double\n'
+                          '    Data Kind: User',
+                          '45:\n'
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(4, 1, 3\)\n'
+                          '    Data Type: double2\n'
+                          '    Data Kind: User',
+                          '46:\n'
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(1, 2, 3\)\n'
+                          '    Data Type: double3\n'
+                          '    Data Kind: User',
+                          '47:\n'
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(1, 2, 3\)\n'
+                          '    Data Type: double4\n'
+                          '    Data Kind: User'])
+
+    # Ordered test 6 (java only): delete breakpoint 3, continue to the next
+    # breakpoint stop and check a selection of entries printed by
+    # 'language renderscript allocation list'.
+    # NOTE(review): the second try_command argument looks like a list of
+    # literal strings expected in the output and the third a list of regular
+    # expressions -- confirm against the harness.
+    @wimpy
+    @ordered_test(6)
+    @java_only_test()
+    def test_allocation_list_all4(self):
+        self.try_command('breakpoint del 3',
+                         ['1 breakpoints deleted'])
+
+        # Hit last kernel
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+        # Each list element below is one multi-line pattern built by implicit
+        # string concatenation; it describes a single allocation entry.
+        self.try_command('language renderscript allocation list',
+                         [],
+                         ['2:\n'
+                          # Regex for non zero hex number
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(64, 64, 0\)\n'
+                          '    Data Type: uchar4\n'
+                          '    Data Kind: RGBA Pixel',
+                          '7:\n'
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(24, 0, 0\)\n'
+                          '    Data Type: short\n'
+                          '    Data Kind: User',
+                          '28:\n'
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(24, 0, 0\)\n'
+                          '    Data Type: uint\n'
+                          '    Data Kind: User',
+                          '46:\n'
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(1, 2, 3\)\n'
+                          '    Data Type: double3\n'
+                          '    Data Kind: User',
+                          # The last two entries both report the
+                          # complexStruct data type.
+                          '48:\n'
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(24, 0, 0\)\n'
+                          '    Data Type: complexStruct\n'
+                          '    Data Kind: User',
+                          '49:\n'
+                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
+                          '    Dimensions: \(24, 0, 0\)\n'
+                          '    Data Type: complexStruct\n'
+                          '    Data Kind: User'])
+
+    # Ordered test 7 (cpp only): delete breakpoint 3 and let the process run
+    # until it reports a zero exit status.
+    @ordered_test(7)
+    @cpp_only_test()
+    def test_cpp_cleanup(self):
+        self.try_command('breakpoint delete 3', ['1 breakpoints deleted'])
+
+        # With the breakpoint gone the process is expected to exit cleanly.
+        self.try_command('process continue',
+                         ['exited with status = 0'])

diff --git a/tests/lldb/tests/testcases/test_backtrace.py b/tests/lldb/tests/testcases/test_backtrace.py
new file mode 100644
index 0000000..1e5e79f
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_backtrace.py

@@ -0,0 +1,92 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestBacktrace.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    ordered_test,
+    cpp_only_test,
+)
+
+
+class TestBacktrace(TestBaseRemote):
+    '''Tests breaking on a kernel and a function, and viewing the call stack.'''
+
+    # Maps each app type key (java, jni, cpp) to a bundle name.
+    # NOTE(review): presumably the harness uses this to pick the app to
+    # launch -- confirm in TestBaseRemote.
+    bundle_target = {
+        'java': 'BranchingFunCalls',
+        'jni': 'JNIBranchingFunCalls',
+        'cpp': 'CppBranchingFunCalls'
+    }
+
+    # Breaks first on the kernel 'simple_kernel' and then on the function
+    # 'set_i', checking the 'bt' output at each stop.
+    def test_kernel_backtrace(self):
+        # pylint: disable=line-too-long
+        self.try_command('language renderscript status',
+                         ['Runtime Library discovered',
+                          'Runtime Driver discovered'])
+
+        # The kernel breakpoint is reported as pending when it is created.
+        self.try_command('language renderscript kernel breakpoint set simple_kernel',
+                         ['Breakpoint(s) created',
+                          '(pending)'])
+
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+        # The trailing list holds a regex accepting lines 61-63 of scalars.rs.
+        self.try_command('bt',
+                         ['stop reason = breakpoint',
+                          # We should be able to see three functions in bt:
+                          # libRSCpuRef, kernel.expand and the kernel
+                          'frame #2:',
+                          'librs.scalars.so',
+                          'simple_kernel'],
+                         [r'scalars\.rs:6[123]'])
+
+        self.try_command('breakpoint delete 1',
+                         ['1 breakpoints deleted'])
+
+        # Second breakpoint: a plain function breakpoint on set_i, expected
+        # to be reported at one of lines 36-38 of scalars.rs.
+        self.try_command('b set_i',
+                         ['Breakpoint 2',
+                          'set_i'],
+                         [r'scalars\.rs:3[678]'])
+
+        self.try_command('breakpoint list',
+                         ['set_i', 'resolved'])
+
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+        self.try_command('bt',
+                         ['stop reason = breakpoint',
+                          # We should be able to see five functions in bt:
+                          # libRSCpuRef, kernel.expand, kernel and two functions
+                          'frame #4:',
+                          'librs.scalars.so',
+                          'modify_i',
+                          'set_i'],
+                         [r'scalars\.rs:3[678]'])
+
+    # cpp-only cleanup, ordered 'last': remove breakpoint 2 and let the
+    # process run until it reports a zero exit status.
+    @ordered_test('last')
+    @cpp_only_test()
+    def test_cpp_cleanup(self):
+        self.try_command('breakpoint delete 2',
+                         ['1 breakpoints deleted'])
+
+        self.try_command('process continue',
+                         ['exited with status = 0'])

diff --git a/tests/lldb/tests/testcases/test_breakpoint_coordinate.py b/tests/lldb/tests/testcases/test_breakpoint_coordinate.py
new file mode 100644
index 0000000..319d4f2
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_breakpoint_coordinate.py

@@ -0,0 +1,177 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestBreakpointCoordinate.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    wimpy,
+    ordered_test,
+    cpp_only_test,
+)
+from harness.assert_mixins import CoordinateAssertionsMixin
+
+
+class TestBreakpointCoordinate(TestBaseRemote, CoordinateAssertionsMixin):
+    '''Tests breaking on a specific kernel invocation.
+
+    Uses the -c option to specify the coordinate.
+    '''
+
+    # Maps each app type key (java, jni, cpp) to a bundle name.
+    bundle_target = {
+        'java': 'Allocations',
+        'jni': 'JNIAllocations',
+        'cpp': 'CppAllocations'
+    }
+
+    def setup(self, android):
+        '''This test needs to be run on a single thread.
+
+        Args:
+            android: The android_util module.
+        '''
+        android.push_prop('debug.rs.max-threads', 1)
+
+    def teardown(self, android):
+        '''Reset the number of RS threads to the previous value.
+
+        Args:
+            android: The android_util module.
+        '''
+        android.pop_prop('debug.rs.max-threads')
+
+    # Ordered test 0: coordinate breakpoints on the 2D swizzle_kernel and then
+    # on the 1D square_kernel. Breakpoint ids 1-3 created here are relied on
+    # by the later tests in this class.
+    @wimpy
+    @ordered_test(0)
+    def test_breakpoint_coordinate_2d_swizzle_kernel(self):
+        # pylint: disable=line-too-long
+
+        # test conditional coordinate in two dimensions
+        # breakpoint 1
+        self.assert_coord_bp_set('swizzle_kernel', 3, 7)
+
+        # we will delete this breakpoint before we hit it.
+        # breakpoint 2
+        self.assert_coord_bp_set('swizzle_kernel', 199, 190)
+
+        self.assert_coord_stop('allocs', 'swizzle_kernel', x=3, y=7)
+
+        # check breakpoints that have been hit are disabled
+        self.try_command(
+            'breakpoint list',
+            [
+                "1: RenderScript kernel breakpoint for 'swizzle_kernel', locations = 1 Options: disabled",
+                "2: RenderScript kernel breakpoint for 'swizzle_kernel', locations = 1"
+            ]
+        )
+
+        # delete breakpoint 2, the one set on coordinate (199, 190)
+        self.try_command('breakpoint delete 2', ['1 breakpoints deleted'])
+
+        # breakpoint 1 should still be listed as disabled after the deletion
+        self.try_command(
+            'breakpoint list',
+            ["1: RenderScript kernel breakpoint for 'swizzle_kernel', locations = 1 Options: disabled"]
+        )
+
+        # test conditional coordinate in a single dimension
+        # breakpoint 3
+        self.assert_coord_bp_set('square_kernel', 8)
+
+        # the new breakpoint 3 is listed alongside the disabled breakpoint 1
+        self.try_command(
+            'breakpoint list',
+            [
+                "1: RenderScript kernel breakpoint for 'swizzle_kernel', locations = 1 Options: disabled",
+                "3: RenderScript kernel breakpoint for 'square_kernel', locations = 1"
+            ]
+        )
+
+        self.assert_coord_stop('allocs', 'square_kernel', x=8)
+
+        # check breakpoints that have been hit are disabled
+        self.try_command(
+            'breakpoint list',
+            [
+                "1: RenderScript kernel breakpoint for 'swizzle_kernel', locations = 1 Options: disabled",
+                "3: RenderScript kernel breakpoint for 'square_kernel', locations = 1 Options: disabled"
+            ]
+        )
+
+    # Ordered test 1: two 3D coordinate breakpoints on add_half_kernel plus a
+    # plain breakpoint on a helper function called from the kernel. Continues
+    # from the breakpoint state left by the previous test.
+    @wimpy
+    @ordered_test(1)
+    def test_breakpoint_coordinate_3d_add_half_kernel(self):
+        # test conditional coordinate in three dimensions
+        # breakpoint 4
+        self.assert_coord_bp_set('add_half_kernel', 0, 0, 1)
+        # test we can set more than one conditional kernel breakpoint
+        # and both will be hit;
+        # breakpoint 5
+        self.assert_coord_bp_set('add_half_kernel', 0, 1, 2)
+
+        # Now assert that the next two continue/stop cycles hit our conditionals
+        self.assert_coord_stop('allocs', 'add_half_kernel', x=0, y=0, z=1)
+        self.assert_coord_stop('allocs', 'add_half_kernel', x=0, y=1, z=2)
+
+        # check we can see the coordinate from a function invoked by the kernel
+        # breakpoint 6
+        self.try_command(
+            'break set -n half_helper',
+            ['librs.allocs.so`half_helper']
+        )
+
+        # continue till we hit breakpoint 6
+        self.assert_coord_stop('allocs', 'half_helper', x=0, y=1, z=2)
+
+        # all kernel breakpoints hit so far are disabled; breakpoint 6 is an
+        # ordinary breakpoint and reports its hit count instead
+        self.try_command(
+            'breakpoint list',
+            [
+                "1: RenderScript kernel breakpoint for 'swizzle_kernel', locations = 1 Options: disabled",
+                "3: RenderScript kernel breakpoint for 'square_kernel', locations = 1 Options: disabled",
+                "4: RenderScript kernel breakpoint for 'add_half_kernel', locations = 1 Options: disabled",
+                "5: RenderScript kernel breakpoint for 'add_half_kernel', locations = 1 Options: disabled",
+                "6: name = 'half_helper', locations = 1, resolved = 1, hit count = 1"
+            ]
+        )
+
+        # delete breakpoint 3 and check the remaining ones are still listed
+        self.try_command('breakpoint delete 3', ['1 breakpoints deleted'])
+
+        self.try_command(
+            'breakpoint list',
+            [
+                "1: RenderScript kernel breakpoint for 'swizzle_kernel', locations = 1 Options: disabled",
+                "4: RenderScript kernel breakpoint for 'add_half_kernel', locations = 1 Options: disabled",
+                "5: RenderScript kernel breakpoint for 'add_half_kernel', locations = 1 Options: disabled",
+                "6: name = 'half_helper', locations = 1, resolved = 1, hit count = 1"
+            ]
+        )
+
+        # delete breakpoint 6 and check the remaining ones are still listed
+        self.try_command('breakpoint delete 6', ['1 breakpoints deleted'])
+
+        self.try_command(
+            'breakpoint list',
+            [
+                "1: RenderScript kernel breakpoint for 'swizzle_kernel', locations = 1 Options: disabled",
+                "4: RenderScript kernel breakpoint for 'add_half_kernel', locations = 1 Options: disabled",
+                "5: RenderScript kernel breakpoint for 'add_half_kernel', locations = 1 Options: disabled"
+            ]
+        )
+
+    # cpp-only cleanup, ordered 'last': delete breakpoints 4 and 5, then let
+    # the process run until it reports a zero exit status.
+    @cpp_only_test()
+    @ordered_test('last')
+    def test_cpp_cleanup(self):
+        self.try_command('breakpoint delete 4', ['1 breakpoints deleted'])
+        self.try_command('breakpoint delete 5', ['1 breakpoints deleted'])
+        self.try_command('process continue', ['exited with status = 0'])

diff --git a/tests/lldb/tests/testcases/test_breakpoint_fileline.py b/tests/lldb/tests/testcases/test_breakpoint_fileline.py
new file mode 100644
index 0000000..be89ca9
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_breakpoint_fileline.py

@@ -0,0 +1,66 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestBreakpointFileLine.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    cpp_only_test,
+    ordered_test
+)
+
+
+class TestBreakpointFileLine(TestBaseRemote):
+    '''Tests the setting of a breakpoint on a specific line of a RS file.'''
+
+    # Maps each app type key (java, jni, cpp) to a bundle name.
+    bundle_target = {
+        'java': 'JavaDebugWaitAttach',
+        'jni': 'JNIDebugWaitAttach',
+        'cpp': 'CppDebugWaitAttach'
+    }
+
+    # Ordered test 0: set a file/line breakpoint on simple.rs:28, run to it,
+    # inspect the stop and delete the breakpoint again.
+    @ordered_test(0)
+    def test_breakpoint_fileline(self):
+        self.try_command('language renderscript status',
+                         ['Runtime Library discovered',
+                          'Runtime Driver discovered'])
+
+        # The breakpoint is reported as pending when it is created.
+        self.try_command('breakpoint set --file simple.rs --line 28',
+                         ['(pending)'])
+
+        # No output is required from the continue itself; the stop is
+        # checked through the 'bt' and 'process status' commands below.
+        self.try_command('process continue',
+                         [])
+
+        self.try_command('bt',
+                         ['librs.simple.so',
+                          'simple_kernel',
+                          'stop reason = breakpoint'])
+
+        self.try_command('breakpoint list',
+                         ['simple.rs',
+                          'resolved = 1'])
+
+        self.try_command('process status',
+                         ['stopped',
+                          'stop reason = breakpoint'])
+
+        self.try_command('breakpoint delete 1',
+                         ['1 breakpoints deleted'])
+
+    # cpp-only cleanup, ordered 'last': the breakpoint was already deleted
+    # above, so only the continue-to-exit check remains.
+    @ordered_test('last')
+    @cpp_only_test()
+    def test_cpp_cleanup(self):
+        self.try_command('process continue', ['exited with status = 0'])

diff --git a/tests/lldb/tests/testcases/test_breakpoint_fileline_multiple_rs_files.py b/tests/lldb/tests/testcases/test_breakpoint_fileline_multiple_rs_files.py
new file mode 100644
index 0000000..9ba4283
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_breakpoint_fileline_multiple_rs_files.py

@@ -0,0 +1,89 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestBreakpointFileLineMultipleRSFiles.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    cpp_only_test,
+    ordered_test
+)
+
+
+class TestBreakpointFileLineMultipleRSFiles(TestBaseRemote):
+    '''Tests the setting of a breakpoint on one of multiple RS files.'''
+
+    # Maps each app type key (java, jni, cpp) to a bundle name.
+    bundle_target = {
+        'java': 'MultipleRSFiles',
+        'jni': 'JNIMultipleRSFiles',
+        'cpp': 'CppMultipleRSFiles'
+    }
+
+    # Returns the name expected in the "name = '...'" part of the stop output
+    # for the current self.app_type.
+    # NOTE(review): the cpp value looks like 'CppMultipleRSFiles' cut to 15
+    # characters, presumably matching a truncated thread/process name --
+    # confirm.
+    def _binary_name(self):
+        return {
+            'java': 'multiplersfiles',
+            'jni': 'multiplersfiles',
+            'cpp': 'CppMultipleRSFi'
+        }[self.app_type]
+
+    # Ordered test 0: file/line breakpoints in first.rs and then second.rs,
+    # checking that each one is hit in its own script library.
+    @ordered_test(0)
+    def test_breakpoint_fileline_multiple_files(self):
+        self.try_command('language renderscript status',
+                         ['Runtime Library discovered',
+                          'Runtime Driver discovered'])
+
+        # Breakpoint 1 is reported as pending when it is created.
+        self.try_command('breakpoint set --file first.rs --line 28',
+                         ['(pending)'])
+
+        self.try_command('process continue',
+                         ['stopped',
+                          'librs.first.so`first_kernel',
+                          'at first.rs:28',
+                          "name = '%s'" % self._binary_name(),
+                          'stop reason = breakpoint 1'])
+
+        # Breakpoint 2 is expected to report its location in
+        # librs.second.so straight away rather than '(pending)'.
+        self.try_command('breakpoint set --file second.rs --line 23',
+                         ['Breakpoint 2',
+                          'librs.second.so`second_kernel',
+                          'second.rs:23'])
+
+        self.try_command('breakpoint list',
+                         ['first.rs',
+                          'second.rs',
+                          'resolved = 1',
+                          'first.rs:28',
+                          'second.rs:23'])
+
+        # Remove breakpoint 1 so the next stop is attributed to breakpoint 2.
+        self.try_command('breakpoint delete 1',
+                         ['1 breakpoints deleted'])
+
+        self.try_command('process continue',
+                         ['stopped',
+                          'librs.second.so`second_kernel',
+                          'at second.rs:23',
+                          "name = '%s'" % self._binary_name(),
+                          'stop reason = breakpoint 2'])
+
+        self.try_command('process status',
+                         ['stopped',
+                          'stop reason = breakpoint'])
+
+    # cpp-only cleanup, ordered 'last': delete breakpoint 2 and let the
+    # process run until it reports a zero exit status.
+    @cpp_only_test()
+    @ordered_test('last')
+    def test_cpp_cleanup(self):
+        self.try_command('breakpoint delete 2', ['1 breakpoints deleted'])
+
+        self.try_command('process continue', ['exited with status = 0'])

diff --git a/tests/lldb/tests/testcases/test_breakpoint_kernel_1.py b/tests/lldb/tests/testcases/test_breakpoint_kernel_1.py
new file mode 100644
index 0000000..a501b66
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_breakpoint_kernel_1.py

@@ -0,0 +1,96 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestBreakpointKernel1.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    ordered_test,
+    cpp_only_test
+)
+
+
+class TestBreakpointKernel1(TestBaseRemote):
+    '''Tests the setting of a breakpoint on a RS kernel.'''
+
+    # Maps each app type key (java, jni, cpp) to a bundle name.
+    bundle_target = {
+        'java': 'JavaDebugWaitAttach',
+        'jni': 'JNIDebugWaitAttach',
+        'cpp': 'CppDebugWaitAttach'
+    }
+
+    # Ordered test 0: set kernel breakpoints on an existing kernel
+    # (breakpoint 1) and a nonexistent one (breakpoint 2), then check that
+    # only the existing kernel's breakpoint resolves once the process stops.
+    @ordered_test(0)
+    def test_breakpoint_set_nonexistent_kernel(self):
+        # pylint: disable=line-too-long
+        self.try_command('language renderscript status',
+                         ['Runtime Library discovered',
+                          'Runtime Driver discovered'])
+
+        self.try_command('language renderscript kernel breakpoint set simple_kernel',
+                         ['Breakpoint(s) created',
+                          '(pending)'])
+
+        # Try set a breakpoint on a kernel which doesn't exist
+        self.try_command('language renderscript kernel breakpoint set imaginary_kernel',
+                         ['Breakpoint(s) created',
+                          '(pending)'])
+
+        # Both breakpoints are pending with no locations before continuing.
+        self.try_command('breakpoint list',
+                         ["'simple_kernel', locations = 0 (pending)",
+                          "'imaginary_kernel', locations = 0 (pending)"])
+
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+        self.try_command('bt',
+                         ['stop reason = breakpoint',
+                          'frame #0:',
+                          'librs.simple.so',
+                          'simple_kernel'])
+
+        # imaginary_kernel stays pending while simple_kernel is now resolved.
+        self.try_command('breakpoint list',
+                         ["'imaginary_kernel', locations = 0 (pending)",
+                          "'simple_kernel', locations = 1, resolved = 1"])
+
+    # Ordered test 1: delete the pending breakpoint 2, check breakpoint 1 is
+    # still hit, then delete it as well. Relies on the state left by the
+    # previous test.
+    @ordered_test(1)
+    def test_breakpoint_delete_nonexistent_kernel(self):
+        # Delete breakpoint on kernel which doesn't exist
+        self.try_command('breakpoint delete 2',
+                         ['1 breakpoints deleted'])
+
+        self.try_command('breakpoint list',
+                         ["'simple_kernel', locations = 1, resolved = 1"])
+
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+        self.try_command('breakpoint list',
+                         ["'simple_kernel', locations = 1, resolved = 1"])
+
+        self.try_command('breakpoint delete 1',
+                         ['1 breakpoints deleted'])
+
+        self.try_command('breakpoint list',
+                         ['No breakpoints currently set'])
+
+    # cpp-only cleanup, ordered 'last': no breakpoints remain, so the process
+    # is expected to run to a zero exit status.
+    @ordered_test('last')
+    @cpp_only_test()
+    def test_cpp_cleanup(self):
+        self.try_command('process continue', ['exited with status = 0'])

diff --git a/tests/lldb/tests/testcases/test_breakpoint_kernel_2.py b/tests/lldb/tests/testcases/test_breakpoint_kernel_2.py
new file mode 100644
index 0000000..6dea13f
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_breakpoint_kernel_2.py

@@ -0,0 +1,55 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestBreakpointKernel2.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+
+
+class TestBreakpointKernel2(TestBaseRemote):
+    '''Tests the setting of a breakpoint on a RS kernel.'''
+
+    # Maps each app type key (java, jni, cpp) to a bundle name.
+    bundle_target = {
+        'java': 'JavaInfiniteLoop',
+        'jni': 'JNIInfiniteLoop',
+        'cpp': 'CppInfiniteLoop'
+    }
+
+    # Sets a kernel breakpoint on simple_kernel, continues to it and checks
+    # the backtrace, breakpoint list and process status at the stop.
+    # Unlike the sibling test classes there is no cpp cleanup test here.
+    def test_breakpoint_resolution_simple_kernel(self):
+        # pylint: disable=line-too-long
+        self.try_command('language renderscript kernel breakpoint set simple_kernel',
+                         ['Breakpoint(s) created'])
+
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+        # The trailing list holds a regex accepting line 43 or 44 of
+        # infiniteloop.rs.
+        self.try_command('bt',
+                         ['stop reason = breakpoint',
+                          'frame #0:',
+                          'librs.infiniteloop.so',
+                          'simple_kernel'],
+                         [r'infiniteloop\.rs:4[34]'])
+
+        self.try_command('breakpoint list',
+                         ['simple_kernel',
+                          'resolved = 1'])
+
+        self.try_command('process status',
+                         ['stopped',
+                          '.so`simple_kernel',
+                          'stop reason = breakpoint'])

diff --git a/tests/lldb/tests/testcases/test_breakpoint_kernel_all.py b/tests/lldb/tests/testcases/test_breakpoint_kernel_all.py
new file mode 100644
index 0000000..ea988e4
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_breakpoint_kernel_all.py

@@ -0,0 +1,94 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestBreakpointKernelAll.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    ordered_test,
+    cpp_only_test,
+)
+
+
+class TestBreakpointKernelAll(TestBaseRemote):
+    '''Tests setting breakpoints on every RS kernel.'''
+
+    # Maps each app type key (java, jni, cpp) to a bundle name.
+    bundle_target = {
+        'java': 'JavaDebugWaitAttach',
+        'jni': 'JNIDebugWaitAttach',
+        'cpp': 'CppDebugWaitAttach'
+    }
+
+    # Ordered test 0: exercises 'kernel breakpoint all enable/disable' both
+    # before and after the kernels are loaded. The breakpoint ids used below
+    # (1 to 4) follow from the order in which breakpoints get created.
+    @ordered_test(0)
+    def test_kernel_breakpoint_all_unloaded_kernels(self):
+        # Test command works with no kernels currently loaded
+        self.try_command('language renderscript kernel breakpoint all enable',
+                         ['Breakpoints will be set on all kernels'])
+
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+        # Both kernels should now have a resolved breakpoint.
+        self.try_command('breakpoint list',
+                         ["'simple_kernel', locations = 1, resolved = 1",
+                          "'other_kernel', locations = 1, resolved = 1"])
+
+        # Check disable doesn't delete breakpoints
+        self.try_command('language renderscript kernel breakpoint all disable',
+                         ['Breakpoints will not be set on any new kernels'])
+
+        # Delete all breakpoints manually
+        self.try_command('breakpoint delete 1',
+                         ['1 breakpoints deleted'])
+
+        self.try_command('breakpoint delete 2',
+                         ['1 breakpoints deleted'])
+
+        self.try_command('breakpoint list',
+                         ["No breakpoints currently set"])
+
+        # Test command works when kernels are loaded
+        self.try_command('language renderscript kernel breakpoint all enable',
+                         ['Breakpoints will be set on all kernels'])
+
+        # The breakpoints are expected to be resolved immediately this time,
+        # without continuing first.
+        self.try_command('breakpoint list',
+                         ["'simple_kernel', locations = 1, resolved = 1",
+                          "'other_kernel', locations = 1, resolved = 1"])
+
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+        # Drop breakpoint 3 so that only the other_kernel breakpoint remains.
+        self.try_command('breakpoint delete 3',
+                         ['1 breakpoints deleted'])
+
+        # Check other_kernel breakpoint gets hit
+        self.try_command('breakpoint list',
+                         ["'other_kernel', locations = 1, resolved = 1"])
+
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+    # cpp-only cleanup, ordered 'last': delete breakpoint 4 and let the
+    # process run until it reports a zero exit status.
+    @ordered_test('last')
+    @cpp_only_test()
+    def test_cpp_cleanup(self):
+        self.try_command('breakpoint delete 4', ['1 breakpoints deleted'])
+
+        self.try_command('process continue', ['exited with status = 0'])

diff --git a/tests/lldb/tests/testcases/test_breakpoint_kernel_all_multiple_rs_files.py b/tests/lldb/tests/testcases/test_breakpoint_kernel_all_multiple_rs_files.py
new file mode 100644
index 0000000..675b5a3
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_breakpoint_kernel_all_multiple_rs_files.py

@@ -0,0 +1,100 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestBreakpointKernelAllMultipleRSFiles.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    wimpy,
+    ordered_test,
+    cpp_only_test,
+)
+
+
+class TestBreakpointKernelAllMultipleRSFiles(TestBaseRemote):
+    '''Tests setting breakpoints on every RS kernel in multiple kernel files.'''
+
+    bundle_target = {
+        'java': 'MultipleRSFiles',
+        'jni': 'JNIMultipleRSFiles',
+        'cpp': 'CppMultipleRSFiles'
+    }
+
+    def _set_all_kernel_breakpoints(self, state, expected_message):
+        '''Run `kernel breakpoint all <state>` and check lldb's reply.'''
+        self.try_command(
+            'language renderscript kernel breakpoint all ' + state,
+            [expected_message])
+
+    def _continue_to_breakpoint(self):
+        '''Resume the process and expect it to stop on a breakpoint.'''
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+    def _expect_resolved_kernels(self, *kernel_names):
+        '''Check `breakpoint list` shows one resolved location per kernel.'''
+        self.try_command('breakpoint list',
+                         ["'%s', locations = 1, resolved = 1" % kernel
+                          for kernel in kernel_names])
+
+    def _delete_breakpoint(self, breakpoint_id):
+        '''Delete a single breakpoint by its numeric lldb id.'''
+        self.try_command('breakpoint delete %d' % breakpoint_id,
+                         ['1 breakpoints deleted'])
+
+    @ordered_test(0)
+    def test_deferred_breakpoint_resolution(self):
+        # The command is issued before the first stop, i.e. with no kernels
+        # currently loaded; after continuing, both kernels must be listed
+        # with a resolved location.
+        self._set_all_kernel_breakpoints(
+            'enable', 'Breakpoints will be set on all kernels')
+        self._continue_to_breakpoint()
+        self._expect_resolved_kernels('first_kernel', 'second_kernel')
+
+    @ordered_test(1)
+    def test_disable_all_kernel_breakpoint_doesnt_delete_breakpoints(self):
+        # Disabling must leave the existing breakpoints in place ...
+        self._set_all_kernel_breakpoints(
+            'disable', 'Breakpoints will not be set on any new kernels')
+
+        # ... so breakpoints 1 and 2 are still there to be deleted by hand.
+        for breakpoint_id in (1, 2):
+            self._delete_breakpoint(breakpoint_id)
+
+        self.try_command('breakpoint list',
+                         ["No breakpoints currently set"])
+
+    @ordered_test(2)
+    def test_enable_breakpoint_on_loaded_kernels(self):
+        # This time the kernels are already loaded when the command runs.
+        self._set_all_kernel_breakpoints(
+            'enable', 'Breakpoints will be set on all kernels')
+        self._expect_resolved_kernels('first_kernel', 'second_kernel')
+        self._continue_to_breakpoint()
+
+        self._delete_breakpoint(3)
+
+        # Only the second_kernel breakpoint is left; check it gets hit.
+        self._expect_resolved_kernels('second_kernel')
+        self._continue_to_breakpoint()
+
+    @ordered_test('last')
+    @cpp_only_test()
+    def test_cpp_cleanup(self):
+        # Remove the remaining breakpoint and let the process run to exit.
+        self._delete_breakpoint(4)
+        self.try_command('process continue', ['exited with status = 0'])
+

diff --git a/tests/lldb/tests/testcases/test_breakpoint_kernel_multiple_rs_files.py b/tests/lldb/tests/testcases/test_breakpoint_kernel_multiple_rs_files.py
new file mode 100644
index 0000000..5cb29d0
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_breakpoint_kernel_multiple_rs_files.py

@@ -0,0 +1,93 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestBreakpointKernelMultipleRSFiles.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    ordered_test,
+    cpp_only_test
+)
+
+
+class TestBreakpointKernelMultipleRSFiles(TestBaseRemote):
+    '''Tests the setting of a breakpoint on RS kernels in multiple files.'''
+
+    bundle_target = {
+        'java': 'MultipleRSFiles',
+        'jni': 'JNIMultipleRSFiles',
+        'cpp': 'CppMultipleRSFiles'
+    }
+
+    def _binary_name(self):
+        '''Return the text expected after `name = ` in the stop output.
+
+        Raises KeyError if self.app_type is not 'java', 'jni' or 'cpp'.
+        '''
+        names_by_app_type = {
+            'java': 'multiplersfiles',
+            'jni': 'multiplersfiles',
+            'cpp': 'CppMultipleRSFi'
+        }
+        return names_by_app_type[self.app_type]
+
+    def test_kernel_breakpoint_multiple_rs_files(self):
+        # pylint: disable=line-too-long
+        set_kernel_breakpoint = 'language renderscript kernel breakpoint set '
+        first_resolved = "'first_kernel', locations = 1, resolved = 1"
+        second_resolved = "'second_kernel', locations = 1, resolved = 1"
+        one_deleted = '1 breakpoints deleted'
+
+        # The first breakpoint is requested up front, so it starts out pending.
+        self.try_command(set_kernel_breakpoint + 'first_kernel',
+                         ['Breakpoint(s) created', '(pending)'])
+        self.try_command('breakpoint list',
+                         ["'first_kernel', locations = 0 (pending)"])
+
+        # Continuing must stop in first_kernel, which resolves breakpoint 1.
+        name_line = "name = '%s'" % self._binary_name()
+        self.try_command('process continue',
+                         ['stopped',
+                          'librs.first.so`first_kernel',
+                          name_line,
+                          'stop reason = breakpoint 1'],
+                         [r'at first\.rs:2[678]'])
+        self.try_command('breakpoint list', [first_resolved])
+
+        # The second breakpoint is set while stopped and reports its location
+        # immediately.
+        self.try_command(set_kernel_breakpoint + 'second_kernel',
+                         ['Breakpoint(s) created',
+                          'Breakpoint 2',
+                          'Breakpoint(s) created'],
+                         [r"librs\.second\.so`second_kernel at second\.rs:2[012]"])
+        self.try_command('breakpoint list', [first_resolved, second_resolved])
+
+        # Drop breakpoint 1 so that the next stop has to be in second_kernel.
+        self.try_command('breakpoint delete 1', [one_deleted])
+        self.try_command('breakpoint list', [second_resolved])
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint',
+                          "librs.second.so`second_kernel"],
+                         [r'second\.rs:2[012]'])
+
+        self.try_command('breakpoint delete 2', [one_deleted])
+        self.try_command('breakpoint list', ['No breakpoints currently set'])
+
+    @ordered_test('last')
+    @cpp_only_test()
+    def test_cpp_cleanup(self):
+        # No breakpoints are left at this point; just run to exit.
+        exit_message = 'exited with status = 0'
+        self.try_command('process continue', [exit_message])

diff --git a/tests/lldb/tests/testcases/test_call_api_funs.py b/tests/lldb/tests/testcases/test_call_api_funs.py
new file mode 100644
index 0000000..d94df3a
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_call_api_funs.py

@@ -0,0 +1,197 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestCallApiFuns.'''
+
+from __future__ import absolute_import
+
+import re
+import string
+
+from harness.test_base_remote import TestBaseRemote
+from harness import RS_funs
+from harness.decorators import (
+    wimpy,
+    ordered_test,
+    cpp_only_test,
+)
+
+
+class _APIFunsExprTestsMeta(type):
+    """
+    Generate unique, standalone test methods from a list of lldb expressions.
+    The lldb expression evaluation engine for calling RenderScript
+    builtins need to be tested thoroughly; rather than manually
+    write the 1000s of individual test cases, we automatically generate them
+    and their variants to add to the test class. This is done from a list
+    of expressions that are all tested in the same way.
+
+    One method is added to the class dictionary per entry of
+    RS_funs.FUNC_LIST, named `test_<sanitised expression>_<index>`.
+    """
+    def __new__(mcs, name, bases, class_dict):
+        # Escape the punctuation before placing it in the character class so
+        # that every character (including `\`, `]`, `^` and `-`) is matched
+        # literally instead of relying on how they happen to be ordered.
+        func_name_sub = re.compile(
+            r'[%s\s]+' % re.escape(string.punctuation))
+
+        def make_test(line, count):
+            """
+            We use an extra level of indirection here to properly
+            close over the *values* of the loop variables, `line` and `count`
+            """
+            @ordered_test(count)
+            def test(self):
+                # build the expression
+                ret, expr = RS_funs.build_expr(line)
+                try:
+                    # Only the lookup is guarded: a KeyError raised while
+                    # running the command must not be mistaken for
+                    # "no expected value specified".
+                    expected_value = RS_funs.TYPE_MAP[ret]
+                except KeyError:
+                    # just check the return type if no return value
+                    # specified; the expected output is passed as a list,
+                    # like at every other try_command call site
+                    self.try_command(expr, ['(%s)' % ret])
+                else:
+                    # evaluate the expression with expected return value
+                    self.try_command(expr, [], [expected_value])
+            return test
+
+        for count, line in enumerate(RS_funs.FUNC_LIST):
+            # Make a pretty python method that adheres to the testcase standard
+            # Use the `count` parameter to ensure the name is unique in the class
+            test_name = 'test_%s_%s' % (re.sub(func_name_sub, '_', line), count)
+            test = make_test(line, count)
+            test.func_name = test_name
+            # We mark every 10th test case as runnable in wimpy mode
+            class_dict[test_name] = wimpy(test) if count % 10 == 0 else test
+
+        return type(name, bases, class_dict)
+
+
+class TestCallApiFuns(TestBaseRemote):
+    '''Tests calling of some RS API functions. This tests that JITing works.'''
+
+    # The metaclass adds one generated test per RS_funs.FUNC_LIST entry.
+    __metaclass__ = _APIFunsExprTestsMeta
+
+    bundle_target = {
+        'java': "KernelVariables",
+        'jni': "JNIKernelVariables",
+        'cpp': "CppKernelVariables"
+    }
+
+    @wimpy
+    @ordered_test(-2)
+    def test_setup(self):
+        # Check the RS runtime was detected, then stop at simple.rs:145.
+        # NOTE(review): order -2 presumably runs before the -1 and generated
+        # 0..N tests - confirm against harness.decorators.
+        setup_steps = (
+            ('language renderscript status',
+             ['Runtime Library discovered',
+              'Runtime Driver discovered']),
+            ('b -f simple.rs -l 145', []),
+            ('process continue',
+             ['resuming',
+              'stopped',
+              'stop reason = breakpoint']),
+        )
+        for command, expected in setup_steps:
+            self.try_command(command, expected)
+
+    @wimpy
+    @ordered_test(-1)
+    def test_call_api_funs_atomic(self):
+        # Test the atomics separately because we want to check the output.
+        # Each call must print the previous value of the global together with
+        # its return type; the global is then read back to check the update.
+        signed_ret = r'\(int(32_t)?\)'
+        unsigned_ret = r'\(uint(32_t)?\)'
+        global_types = {'int_global': '(int)', 'uint_global': '(uint)'}
+
+        atomic_checks = (
+            # (call, printed value, return type regex, global, value after)
+            ('rsAtomicAdd(&int_global, 2)', '1234', signed_ret,
+             'int_global', '1236'),
+            ('rsAtomicAnd(&uint_global, 333)', '2345', signed_ret,
+             'uint_global', '265'),
+            ('rsAtomicCas(&int_global, 1236, 2345)', '1236', signed_ret,
+             'int_global', '2345'),
+            ('rsAtomicDec(&uint_global)', '265', signed_ret,
+             'uint_global', '264'),
+            ('rsAtomicInc(&int_global)', '2345', signed_ret,
+             'int_global', '2346'),
+            ('rsAtomicMax(&uint_global, 3456)', '264', unsigned_ret,
+             'uint_global', '3456'),
+            ('rsAtomicMin(&int_global, 3)', '2346', signed_ret,
+             'int_global', '3'),
+            ('rsAtomicOr(&int_global, 456)', '3', signed_ret,
+             'int_global', '459'),
+            ('rsAtomicSub(&uint_global, 7)', '3456', signed_ret,
+             'uint_global', '3449'),
+            ('rsAtomicXor(&int_global, 89)', '459', signed_ret,
+             'int_global', '402'),
+        )
+
+        for call, printed, ret_type, global_name, updated in atomic_checks:
+            self.try_command('expr ' + call, [printed], [ret_type])
+            self.try_command('expr ' + global_name,
+                             [global_types[global_name], updated])
+
+    @ordered_test('last')
+    @cpp_only_test()
+    def test_cpp_cleanup(self):
+        # Remove the simple.rs breakpoint and let the process run to exit.
+        cleanup_steps = (
+            ('breakpoint delete 1', '1 breakpoints deleted'),
+            ('process continue', 'exited with status = 0'),
+        )
+        for command, expected_text in cleanup_steps:
+            self.try_command(command, [expected_text])

diff --git a/tests/lldb/tests/testcases/test_coordinates.py b/tests/lldb/tests/testcases/test_coordinates.py
new file mode 100644
index 0000000..8680795
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_coordinates.py

@@ -0,0 +1,128 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestCoordinates.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    ordered_test,
+    cpp_only_test
+)
+
+
+class TestCoordinates(TestBaseRemote):
+    '''Tests the inspection of coordinates.
+
+    Tests the inspection of the range and dimension of coordinates as well
+    as the current coordinates.'''
+
+    bundle_target = {
+        'java': 'JavaDebugWaitAttach',
+        'jni': 'JNIDebugWaitAttach',
+        'cpp': 'CppDebugWaitAttach'
+    }
+
+    def setup(self, android):
+        '''Push debug.rs.max-threads = 1; this test must run on one thread.
+
+        Args:
+            android: The android_util module.
+        '''
+        single_thread = 1
+        android.push_prop('debug.rs.max-threads', single_thread)
+
+    def teardown(self, android):
+        '''Pop the debug.rs.max-threads value pushed by setup.
+
+        Args:
+            android: The android_util module.
+        '''
+        android.pop_prop('debug.rs.max-threads')
+
+    def _lldb_continue(self):
+        '''Run 'process continue' and expect to stop on a breakpoint.'''
+        stop_markers = ['resuming', 'stopped', 'stop reason = breakpoint']
+        self.try_command('process continue', stop_markers)
+
+    def _inspect_coordinates(self, x_coord, y_coord, z_coord):
+        '''Run lldb commands to inspect kernel size and coordinates
+        and match against expected values.
+
+        Args:
+            x_coord: The expected X coordinate (int).
+            y_coord: The expected Y coordinate (int).
+            z_coord: The expected Z coordinate (int).
+
+        Raises:
+            TestFail: One of the lldb commands did not provide the expected
+                      output.
+        '''
+        coordinate_line = ('Coordinate: (%d, %d, %d)'
+                           % (x_coord, y_coord, z_coord))
+        self.try_command('language renderscript kernel coordinate',
+                         [coordinate_line])
+
+        # Frame 1 is expected to be the .expand wrapper of the kernel.
+        self.try_command('frame select 1',
+                         ['librs.simple.so`simple_kernel.expand',
+                          'at generated.rs:1'])
+
+        # The invocation length should be the same on every stop.
+        self.try_command('expr p->dim', ['x = 8', 'y = 8', 'z = 0'])
+
+        # The X coordinate is in the rsIndex variable ...
+        self.try_command('expr rsIndex', ['= %s' % (x_coord,)])
+
+        # ... so p->current is expected to report x = 0 whatever x_coord is;
+        # only its Y and Z components follow the arguments.
+        self.try_command('expr p->current',
+                         ['x = 0',
+                          'y = %s' % (y_coord,),
+                          'z = %s' % (z_coord,)])
+
+    @ordered_test(0)
+    def test_inspect_coordinates(self):
+        # pylint: disable=line-too-long
+        self.try_command('language renderscript status',
+                         ['Runtime Library discovered',
+                          'Runtime Driver discovered'])
+
+        self.try_command('language renderscript kernel breakpoint set simple_kernel',
+                         ['Breakpoint(s) created',
+                          '(pending)'])
+
+        # The first three stops are the initial coordinate followed by two
+        # more steps along x.
+        for expected_x in range(3):
+            self._lldb_continue()
+            self._inspect_coordinates(expected_x, 0, 0)
+
+        # After eight more steps we should have advanced one step in the y dimension.
+        remaining_steps = 8
+        while remaining_steps:
+            self._lldb_continue()
+            remaining_steps -= 1
+        self._inspect_coordinates(2, 1, 0)
+
+    @ordered_test('last')
+    @cpp_only_test()
+    def test_cpp_cleanup(self):
+        # Remove the kernel breakpoint and let the process run to exit.
+        self.try_command('breakpoint delete 1', ['1 breakpoints deleted'])
+        self.try_command('process continue', ['exited with status = 0'])

diff --git a/tests/lldb/tests/testcases/test_dwarf_lang.py b/tests/lldb/tests/testcases/test_dwarf_lang.py
new file mode 100644
index 0000000..08cf859
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_dwarf_lang.py

@@ -0,0 +1,36 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test DWARF language attribute test.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+
+
+class TestDWARFLang(TestBaseRemote):
+    '''Tests the DWARF language attribute is present in RenderScript kernels.'''
+
+    bundle_target = {
+        'java': 'JavaDebugWaitAttach',
+        'jni': 'JNIDebugWaitAttach',
+        'cpp': 'CppDebugWaitAttach'
+    }
+
+    def test_renderscript_kernel_frame_dwarf_language(self):
+        # Stop inside simple_kernel; the output of these commands is not
+        # checked (empty expectation lists), only the final language assert.
+        commands_to_reach_kernel = (
+            'language renderscript status',
+            'b simple_kernel',
+            'process continue',
+        )
+        for command in commands_to_reach_kernel:
+            self.try_command(command, [])
+
+        self.assert_lang_renderscript()

diff --git a/tests/lldb/tests/testcases/test_invoke_fun.py b/tests/lldb/tests/testcases/test_invoke_fun.py
new file mode 100644
index 0000000..2648558
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_invoke_fun.py

@@ -0,0 +1,69 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestInvokeFun.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    ordered_test,
+    cpp_only_test
+)
+
+
+class TestInvokeFun(TestBaseRemote):
+    '''Tests debugging a function executed from Java using invoke_*.'''
+
+    bundle_target = {
+        'java': 'BranchingFunCalls',
+        'jni': 'JNIBranchingFunCalls',
+        'cpp': 'CppBranchingFunCalls'
+    }
+
+    def test_invoke_fun(self):
+        # pylint: disable=line-too-long
+        # Each entry holds the positional arguments of one try_command call:
+        # (command, expected substrings[, expected regexes]).
+        session = (
+            ('language renderscript status',
+             ['Runtime Library discovered',
+              'Runtime Driver discovered']),
+            # Break on the invoked function first ...
+            ('breakpoint set --name addToGlobal',
+             ['Breakpoint 1', '(pending)']),
+            ('process continue',
+             ['stopped',
+              'stop reason = breakpoint'],
+             [r'scalars\.rs:7[345]']),
+            # ... then on the kernel, and check the global once stopped there.
+            ('language renderscript kernel breakpoint set simple_kernel',
+             ['Breakpoint 2', 'Breakpoint(s) created']),
+            ('process continue',
+             ['stopped',
+              'stop reason = breakpoint',
+              'simple_kernel'],
+             [r'scalars\.rs:6[123]']),
+            ('expr glob',
+             ['(int)',
+              '357']),
+        )
+        for call_args in session:
+            self.try_command(*call_args)
+
+    @ordered_test('last')
+    @cpp_only_test()
+    def test_cpp_cleanup(self):
+        # Delete both breakpoints, then let the process run to exit.
+        for breakpoint_id in ('1', '2'):
+            self.try_command('breakpoint delete ' + breakpoint_id,
+                             ['1 breakpoints deleted'])
+
+        self.try_command('process continue', ['exited with status = 0'])

diff --git a/tests/lldb/tests/testcases/test_language.py b/tests/lldb/tests/testcases/test_language.py
new file mode 100644
index 0000000..8432781
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_language.py

@@ -0,0 +1,39 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestLanguage.'''
+
+from __future__ import absolute_import
+
+from harness.test_base import TestBaseNoTargetProcess
+
+
+class TestLanguage(TestBaseNoTargetProcess):
+    '''
+    Tests the "language" command and "language renderscript" subcommand.
+    '''
+
+    def test_lldb_has_language_commands(self):
+        # First ask the command interpreter object directly ...
+        interpreter = self._ci
+        language_available = (interpreter.HasCommands() and
+                              interpreter.CommandExists('language'))
+        self.assert_true(language_available)
+
+        # ... then check the output of the commands themselves.
+        renderscript_subcommands = ['kernel', 'context', 'module', 'status']
+        self.try_command('language', ['renderscript'])
+        self.try_command('language renderscript', renderscript_subcommands)
+

diff --git a/tests/lldb/tests/testcases/test_language_subcmds.py b/tests/lldb/tests/testcases/test_language_subcmds.py
new file mode 100644
index 0000000..67c8bd1
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_language_subcmds.py

@@ -0,0 +1,162 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestLanguageSubcmds.'''
+
+from __future__ import absolute_import
+
+import os
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    cpp_only_test,
+    ordered_test,
+)
+
+
+class TestLanguageSubcmds(TestBaseRemote):
+    '''Tests the 'language renderscript' subcommands.'''
+
+    bundle_target = {
+        'java': 'JavaDebugWaitAttach',
+        'jni': 'JNIDebugWaitAttach',
+        'cpp': 'CppDebugWaitAttach'
+    }
+
+    def setup(self, android):
+        '''Push debug.rs.max-threads = 1; this test must run on one thread.'''
+        single_thread = 1
+        android.push_prop('debug.rs.max-threads', single_thread)
+
+    def teardown(self, android):
+        '''Pop the debug.rs.max-threads value pushed by setup.'''
+        android.pop_prop('debug.rs.max-threads')
+
+    def _pkg_name(self):
+        '''Return the package name expected in `module dump` output.
+
+        Raises KeyError if self.app_type is not 'java', 'jni' or 'cpp'.
+        '''
+        packages_by_app_type = {
+            'java': 'com.android.rs.waitattachdebug',
+            'jni': 'com.android.rs.jnidebugwaitattach',
+            'cpp': 'com.android.rs.cppwaitattach'
+        }
+        return packages_by_app_type[self.app_type]
+
+    def test_language_subcommands(self):
+        renderscript = 'language renderscript'
+        kernel = renderscript + ' kernel'
+        context = renderscript + ' context'
+        allocation = renderscript + ' allocation'
+        module = renderscript + ' module'
+
+        # Commands whose expected output does not depend on runtime values,
+        # as (command, expected substrings) pairs in execution order.
+        static_steps = (
+            ('language', []),
+            (renderscript + ' status',
+             ['Runtime Library discovered',
+              'Runtime Driver discovered',
+              'Runtime functions hooked',
+              'rsdAllocationInit',
+              'rsdAllocationRead2D',
+              'rsdScriptInit',
+              'rsdScriptInvokeForEach',
+              'rsdScriptInvokeForEachMulti',
+              'rsdScriptSetGlobalVar']),
+            ('breakpoint set --file simple.rs --line 28', ['(pending)']),
+            ('process continue', []),
+            (kernel, ['breakpoint', 'coordinate', 'list']),
+            (kernel + ' breakpoint', ['all', 'set']),
+            (kernel + ' list',
+             ['RenderScript Kernels',
+              "Resource 'simple'",
+              'root',
+              'simple_kernel']),
+            (kernel + ' coordinate', ['Coordinate: (0, 0, 0)']),
+            (context, ['dump']),
+            (context + ' dump',
+             ['Inferred RenderScript Contexts',
+              '1 script instances']),
+            (allocation, ['list', 'load', 'save', 'dump', 'refresh']),
+            (allocation + ' list', ['RenderScript Allocations:']),
+            (allocation + ' list -i 0', ['RenderScript Allocations:']),
+            (allocation + ' list --id 0', ['RenderScript Allocations:']),
+            (allocation + ' dump 1', ['Data (X, Y, Z):']),
+        )
+        for command, expected in static_steps:
+            self.try_command(command, expected)
+
+        # Dump allocation 1 to a file through the short option, remove the
+        # file if it was created, and dump again through the long option.
+        output_file = self.get_tmp_file_path()
+        results_written = "Results written to '%s'" % output_file
+        self.try_command(allocation + ' dump 1 -f ' + output_file,
+                         [results_written])
+
+        if os.path.isfile(output_file):
+            os.remove(output_file)
+
+        self.try_command(allocation + ' dump 1 --file ' + output_file,
+                         [results_written])
+
+        # Save allocation 1 to the same path and load it back.
+        self.try_command(allocation + ' save 1 ' + output_file,
+                         ["Allocation written to file '%s'" % output_file])
+
+        self.try_command(allocation + ' load 1 ' + output_file,
+                         ["Contents of file '%s' read into allocation 1" %
+                          output_file])
+
+        self.try_command(allocation + ' refresh',
+                         ['All allocations successfully recomputed'])
+
+        self.try_command(module, ['dump'])
+
+        package_line = 'java_package_name: %s' % self._pkg_name()
+        self.try_command(module + ' dump',
+                         ['RenderScript Modules:',
+                          'librs.simple.so',
+                          'Debug info loaded',
+                          'Globals: 1',
+                          'gColor - float4',
+                          'Kernels: 3',
+                          'root',
+                          'simple_kernel',
+                          'other_kernel',
+                          package_line,
+                          'version:'])
+
+    @ordered_test('last')
+    @cpp_only_test()
+    def test_cpp_cleanup(self):
+        # Remove the simple.rs breakpoint and let the process run to exit.
+        cleanup_steps = (
+            ('breakpoint delete 1', '1 breakpoints deleted'),
+            ('process continue', 'exited with status = 0'),
+        )
+        for command, expected_text in cleanup_steps:
+            self.try_command(command, [expected_text])

diff --git a/tests/lldb/tests/testcases/test_language_subcmds_no_debug.py b/tests/lldb/tests/testcases/test_language_subcmds_no_debug.py
new file mode 100644
index 0000000..c57343d
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_language_subcmds_no_debug.py

@@ -0,0 +1,146 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestLanguageSubcmdsNoDebug.'''
+
+from __future__ import absolute_import
+
+import os
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    cpp_only_test,
+    ordered_test,
+)
+
+
+class TestLanguageSubcmdsNoDebug(TestBaseRemote):
+    '''Runs the 'language renderscript' subcommands on no-debug bundles.
+
+    The expectations for 'module dump' in particular state that debug info
+    does not exist.
+    '''
+
+    # Bundle name for each app type ('java', 'jni', 'cpp').
+    bundle_target = {
+        'java': 'JavaNoDebugWaitAttach',
+        'jni': 'JNINoDebugWaitAttach',
+        'cpp': 'CppNoDebugWaitAttach',
+    }
+
+    def _pkg_name(self):
+        '''Return the package name listed for self.app_type.
+
+        Raises:
+            KeyError: self.app_type is not 'java', 'jni' or 'cpp'.
+        '''
+        # NOTE(review): the 'cpp' entry has no 'nodebug' part, unlike the
+        # other two - confirm it matches the CppNoDebugWaitAttach bundle.
+        packages = {
+            'java': 'com.android.rs.waitattachnodebug',
+            'jni': 'com.android.rs.jninodebugwaitattach',
+            'cpp': 'com.android.rs.cppwaitattach',
+        }
+        return packages[self.app_type]
+
+    @ordered_test(0)
+    def test_language_subcommands_no_debug(self):
+        # pylint: disable=line-too-long
+        # Commands whose expected output contains no runtime-computed value,
+        # listed in the order they are issued.
+        fixed_steps = [
+            ('language renderscript status',
+             ['Runtime Library discovered',
+              'Runtime Driver discovered']),
+            ('language renderscript kernel breakpoint set simple_kernel',
+             ['(pending)']),
+            ('process continue',
+             []),
+            ('language renderscript kernel',
+             ['breakpoint',
+              'coordinate',
+              'list']),
+            ('language renderscript kernel list',
+             ['RenderScript Kernels',
+              "Resource 'simple'",
+              'root',
+              'simple_kernel']),
+            ('language renderscript context',
+             ['dump']),
+            ('language renderscript context dump',
+             ['Inferred RenderScript Contexts',
+              '1 script instances']),
+            ('language renderscript allocation',
+             ['list',
+              'load',
+              'save',
+              'dump',
+              'refresh']),
+            ('language renderscript allocation list',
+             ['RenderScript Allocations:']),
+            ('language renderscript allocation list -i 0',
+             ['RenderScript Allocations:']),
+            ('language renderscript allocation list --id 0',
+             ['RenderScript Allocations:']),
+            ('language renderscript allocation dump 1',
+             ['Data (X, Y, Z):']),
+        ]
+        for command, expected in fixed_steps:
+            self.try_command(command, expected)
+
+        # The remaining allocation commands all read or write one temp file.
+        output_file = self.get_tmp_file_path()
+        dump_to_file = 'language renderscript allocation dump 1 '
+        written_msg = "Results written to '%s'" % output_file
+
+        # Short spelling of the file option.
+        self.try_command(dump_to_file + '-f ' + output_file,
+                         [written_msg])
+
+        # Remove whatever the first dump left behind before trying the long
+        # spelling of the same option.
+        if os.path.isfile(output_file):
+            os.remove(output_file)
+
+        self.try_command(dump_to_file + '--file ' + output_file,
+                         [written_msg])
+
+        self.try_command(
+            'language renderscript allocation save 1 ' + output_file,
+            ["Allocation written to file '%s'" % output_file])
+
+        self.try_command(
+            'language renderscript allocation load 1 ' + output_file,
+            ["Contents of file '%s' read into allocation 1" % output_file])
+
+        self.try_command('language renderscript allocation refresh',
+                         ['All allocations successfully recomputed'])
+
+        # C++ tests have an additional kernel `other_kernel`
+        kernel_count = 2
+        if self.app_type == 'cpp':
+            kernel_count = 3
+
+        self.try_command('language renderscript module',
+                         ['dump'])
+
+        # NOTE(review): the empty string in this list looks like a
+        # placeholder (possibly for 'other_kernel' on cpp) - confirm how
+        # try_command treats an empty expected string.
+        module_dump_expected = [
+            'RenderScript Modules:',
+            'librs.simple.so',
+            'Debug info does not exist.',
+            'Globals: 1',
+            'gColor - variable identified, but not found in '
+            'binary (symbol exists)',
+            'Kernels: %s' % kernel_count,
+            'root',
+            'simple_kernel',
+            '',
+            'java_package_name: %s' % self._pkg_name(),
+            'version',
+        ]
+        self.try_command('language renderscript module dump',
+                         module_dump_expected)
+
+    @ordered_test('last')
+    @cpp_only_test()
+    def test_cpp_cleanup(self):
+        # Drop breakpoint 1, then let the process run on; the second command
+        # is expected to report an exit status of 0.
+        teardown_steps = (
+            ('breakpoint delete 1', ['1 breakpoints deleted']),
+            ('process continue', ['exited with status = 0']),
+        )
+        for command, expected in teardown_steps:
+            self.try_command(command, expected)

diff --git a/tests/lldb/tests/testcases/test_multiple_rs_files.py b/tests/lldb/tests/testcases/test_multiple_rs_files.py
new file mode 100644
index 0000000..e0497f0
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_multiple_rs_files.py

@@ -0,0 +1,95 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestMultipleRSFiles.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    ordered_test,
+    cpp_only_test,
+)
+
+class TestMultipleRSFiles(TestBaseRemote):
+    '''Checks lldb commands on an apk that contains two rs files.'''
+
+    # Bundle name for each app type ('java', 'jni', 'cpp').
+    bundle_target = {
+        'java': 'MultipleRSFiles',
+        'jni': 'JNIMultipleRSFiles',
+        'cpp': 'CppMultipleRSFiles',
+    }
+
+    def _binary_name(self):
+        '''Return the name expected in lldb's "name = '...'" stop output.
+
+        Raises:
+            KeyError: self.app_type is not 'java', 'jni' or 'cpp'.
+        '''
+        # NOTE(review): the 'cpp' value looks cut short, presumably to match
+        # a 15-character process name limit - confirm.
+        names = {
+            'java': 'multiplersfiles',
+            'jni': 'multiplersfiles',
+            'cpp': 'CppMultipleRSFi',
+        }
+        return names[self.app_type]
+
+    def _pkg_name(self):
+        '''Return the package name listed for self.app_type.
+
+        Raises:
+            KeyError: self.app_type is not 'java', 'jni' or 'cpp'.
+        '''
+        packages = {
+            'java': 'com.android.rs.multiplersfiles',
+            'jni': 'com.android.rs.jnimultiplersfiles',
+            'cpp': 'com.android.rs.cppmultiplersfiles',
+        }
+        return packages[self.app_type]
+
+    def test_multiple_rs_files(self):
+        check = self.try_command
+
+        status_expected = ['Runtime Library discovered',
+                           'Runtime Driver discovered',
+                           'Runtime functions hooked']
+        check('language renderscript status', status_expected)
+
+        # Break inside the first script only.
+        check('breakpoint set --file first.rs --line 28', ['(pending)'])
+
+        stop_expected = ['stopped',
+                         'librs.first.so`first_kernel',
+                         'at first.rs:28',
+                         "name = '%s'" % self._binary_name(),
+                         'stop reason = breakpoint 1']
+        check('process continue', stop_expected)
+
+        # Both script resources and their kernels are expected in the list.
+        kernels_expected = ['RenderScript Kernels',
+                            "Resource 'first'",
+                            "Resource 'second'",
+                            'root',
+                            'first_kernel',
+                            'second_kernel']
+        check('language renderscript kernel list', kernels_expected)
+
+        contexts_expected = ['Inferred RenderScript Contexts',
+                             '2 script instances']
+        check('language renderscript context dump', contexts_expected)
+
+        modules_expected = ['RenderScript Modules:',
+                            'librs.first.so',
+                            'librs.second.so',
+                            'Debug info loaded',
+                            'Globals: 1',
+                            'gColor - float4',
+                            'Kernels: 2',
+                            'root',
+                            'first_kernel',
+                            'second_kernel',
+                            'java_package_name: %s' % self._pkg_name(),
+                            'version:']
+        check('language renderscript module dump', modules_expected)
+
+    @ordered_test('last')
+    @cpp_only_test()
+    def test_cpp_cleanup(self):
+        # Drop breakpoint 1, then let the process run on; the second command
+        # is expected to report an exit status of 0.
+        teardown_steps = (
+            ('breakpoint delete 1', ['1 breakpoints deleted']),
+            ('process continue', ['exited with status = 0']),
+        )
+        for command, expected in teardown_steps:
+            self.try_command(command, expected)

diff --git a/tests/lldb/tests/testcases/test_read_global.py b/tests/lldb/tests/testcases/test_read_global.py
new file mode 100644
index 0000000..0a337e0
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_read_global.py

@@ -0,0 +1,344 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestReadGlobal.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    wimpy,
+    ordered_test,
+    cpp_only_test,
+)
+
+
+class TestReadGlobal(TestBaseRemote):
+    '''Reads back script globals of many types through lldb.'''
+
+    # Bundle name for each app type ('java', 'jni', 'cpp').
+    bundle_target = {
+        'java': 'KernelVariables',
+        'jni': 'JNIKernelVariables',
+        'cpp': 'CppKernelVariables',
+    }
+
+    def _try_inspecting_global(self, global_name, expected_output,
+                               expected_regex=None):
+        '''Check one global through both 'expr' and 'target variable'.
+
+        The two lldb commands are issued in that order on the same global,
+        and each is checked against the same expectations.
+
+        Args:
+            global_name: String naming the global to inspect.
+            expected_output: List of strings that should be found in the
+                             output of each command.
+            expected_regex: List of regular expressions that should match
+                            lldb's output, or None.
+
+        Raises:
+            TestFail: One of the lldb commands did not provide the expected
+                      output.
+        '''
+        for lldb_command in ('expr', 'target variable'):
+            self.try_command(lldb_command + ' ' + global_name,
+                             expected_output,
+                             expected_regex)
+
+    @wimpy
+    @ordered_test(0)
+    def test_setup(self):
+        # Check the runtime was discovered, set a breakpoint at simple.rs
+        # line 145 and continue until a breakpoint stop is reported.
+        setup_steps = (
+            ('language renderscript status',
+             ['Runtime Library discovered',
+              'Runtime Driver discovered']),
+            ('b -f simple.rs -l 145',
+             []),
+            ('process continue',
+             ['resuming',
+              'stopped',
+              'stop reason = breakpoint']),
+        )
+        for command, expected in setup_steps:
+            self.try_command(command, expected)
+
+    @wimpy
+    def test_list_script_globals(self):
+        # pylint: disable=line-too-long
+
+        # Literal fragments expected in the listing of all globals.
+        expected_output = [
+            'Global variables for',
+            'librs.simple.so',
+            "(uchar) uchar_global = '\\xea'",
+            '(short) short_global = -321',
+            '(ushort) ushort_global = 432',
+            '(int) int_global = 1234',
+            '(uint) uint_global = 2345',
+            '(float) float_global = 4.5',
+            '(ulong) ulong_global = 8888',
+            '(double) double_global = -456.5',
+            '(char2) char2_global = (11, -22)',
+            '(uchar2) uchar2_global = (0x21, 0x2c)',
+            '(short2) short2_global = (-555, 666)',
+            '(ushort2) ushort2_global = (777, 888)',
+            '(int2) int2_global = (999, -1111)',
+            '(uint2) uint2_global = (2222, 3333)',
+            '(float2) float2_global = (4.5, -5)',
+            '(long2) long2_global = (-4444, 5555)',
+            '(ulong2) ulong2_global = (6666, 7777)',
+            '(double2) double2_global = (88.5, -99)',
+            '(char3) char3_global = (11, -22, -33,',
+            '(uchar3) uchar3_global = (0x21, 0x2c, 0x37,',
+            '(short3) short3_global = (-555, 666, 777,',
+            '(ushort3) ushort3_global = (777, 888, 999,',
+            '(int3) int3_global = (999, -1111, 2222,',
+            '(uint3) uint3_global = (2222, 3333, 4444,',
+            '(float3) float3_global = (4.5, -5, -6.5,',
+            '(long3) long3_global = (-4444, 5555, 6666,',
+            '(ulong3) ulong3_global = (6666, 7777, 8888,',
+            '(double3) double3_global = (88.5, -99, 111.5,',
+            '(char4) char4_global = (55, 11, -22, -33)',
+            '(uchar4) uchar4_global = (0xde, 0x21, 0x2c, 0x37)',
+            '(short4) short4_global = (-444, -555, 666, 777)',
+            '(ushort4) ushort4_global = (666, 777, 888, 999)',
+            '(int4) int4_global = (888, 999, -1111, 2222)',
+            '(uint4) uint4_global = (1111, 2222, 3333, 4444)',
+            '(float4) float4_global = (3, 4.5, -5, -6.5)',
+            '(long4) long4_global = (-3333, -4444, 5555, 6666)',
+            '(ulong4) ulong4_global = (5555, 6666, 7777, 8888)',
+            '(double4) double4_global = (-77, 88.5, -99, 111.5)',
+            '(rs_matrix2x2) matrix2x2_global = (m = (1, 2.5, 3, 4.5))',
+            '(rs_matrix3x3) matrix3x3_global = {\n'
+            '  m = ([0] = 5, [1] = 6.5, [2] = 7, [3] = 8.5, [4] = 9, [5] = 1.5, [6] = 2, [7] = 3.5, [8] = 4)',
+            '(rs_matrix4x4) matrix4x4_global = {\n'
+            '  m = {\n'
+            '    [0] = 5.5\n'
+            '    [1] = 6\n'
+            '    [2] = 7.5\n'
+            '    [3] = 8\n'
+            '    [4] = 9\n'
+            '    [5] = 1.5\n'
+            '    [6] = 2\n'
+            '    [7] = 3.5\n'
+            '    [8] = 4.5\n'
+            '    [9] = 5.5\n'
+            '    [10] = 6.5\n'
+            '    [11] = 7\n'
+            '    [12] = 8\n'
+            '    [13] = 9.5\n'
+            '    [14] = 1.5\n'
+            '    [15] = 2.5\n'
+            '  }\n',
+            '(rs_quaternion) quaternion_global = (4.5, 5.5, 6, 3)',
+        ]
+        # These two type names are matched by pattern: the patterns accept
+        # 'char' or 'signed char', and 'long' or 'long long'.
+        expected_regex = [
+            r"\((signed )?char\) char_global = '\\f'",
+            r'\((long )?long\) long_global = -77777',
+        ]
+        self.try_command('target variable', expected_output, expected_regex)
+
+    @wimpy
+    def test_read_char_global(self):
+        # Inspect the char global; its type is matched by pattern because
+        # the pattern accepts either 'char' or 'signed char'.
+        self._try_inspecting_global('char_global',
+                                    ["'\\f'"],
+                                    [r'\((signed )?char\)'])
+
+    def test_read_primitive_global(self):
+        # (global name, expected strings, expected regexes or None)
+        primitive_checks = [
+            ('uchar_global', ['(uchar)', "'\\xea'"], None),
+            ('short_global', ['(short)', '-321'], None),
+            ('ushort_global', ['(ushort)', '432'], None),
+            ('int_global', ['(int)', '1234'], None),
+            ('uint_global', ['(uint)', '2345'], None),
+            ('float_global', ['(float)', '4.5'], None),
+            # The pattern accepts either 'long' or 'long long'.
+            ('long_global', ['-77777'], [r'\((long )?long\)']),
+            ('ulong_global', ['(ulong)', '8888'], None),
+            ('double_global', ['(double)', '-456.5'], None),
+            ('char2_global', ['(char2)', '(11, -22)'], None),
+        ]
+        for global_name, expected_output, expected_regex in primitive_checks:
+            self._try_inspecting_global(global_name,
+                                        expected_output,
+                                        expected_regex)
+
+    @wimpy
+    def test_write_global2(self):
+        # NOTE(review): despite its name this test only reads uchar2_global.
+        self._try_inspecting_global('uchar2_global',
+                                    ['(uchar2)', '(0x21, 0x2c)'])
+
+    def test_write_global3(self):
+        # NOTE(review): despite its name this test only reads globals.
+        vector_checks = [
+            ('short2_global', ['(short2)', '(-555, 666)']),
+            ('ushort2_global', ['(ushort2)', '(777, 888)']),
+            ('int2_global', ['(int2)', '(999, -1111)']),
+            ('uint2_global', ['(uint2)', '(2222, 3333)']),
+            ('float2_global', ['(float2)', '(4.5, -5)']),
+            ('long2_global', ['(long2)', '(-4444, 5555)']),
+            ('ulong2_global', ['(ulong2)', '(6666, 7777)']),
+            ('double2_global', ['(double2)', '(88.5, -99)']),
+            ('char3_global', ['(char3)', '(11, -22, -33,']),
+            ('uchar3_global', ['(uchar3)', '(0x21, 0x2c, 0x37,']),
+        ]
+        for global_name, expected_output in vector_checks:
+            self._try_inspecting_global(global_name, expected_output)
+
+    @wimpy
+    def test_global_write_short3(self):
+        # NOTE(review): despite its name this test only reads short3_global.
+        self._try_inspecting_global('short3_global',
+                                    ['(short3)', '(-555, 666, 777,'])
+
+    def test_read_vec3(self):
+        # Expected 3-element values stop after the third comma, so whatever
+        # lldb prints after it is not checked.
+        vector_checks = [
+            ('ushort3_global', ['(ushort3)', '(777, 888, 999,']),
+            ('int3_global', ['(int3)', '(999, -1111, 2222,']),
+            ('uint3_global', ['(uint3)', '(2222, 3333, 4444,']),
+            ('float3_global', ['(float3)', '(4.5, -5, -6.5,']),
+            ('long3_global', ['(long3)', '(-4444, 5555, 6666,']),
+            ('ulong3_global', ['(ulong3)', '(6666, 7777, 8888,']),
+            ('double3_global', ['(double3)', '(88.5, -99, 111.5,']),
+            ('char4_global', ['(char4)', '(55, 11, -22, -33)']),
+            ('uchar4_global', ['(uchar4)', '(0xde, 0x21, 0x2c, 0x37)']),
+            ('short4_global', ['(short4)', '(-444, -555, 666, 777)']),
+        ]
+        for global_name, expected_output in vector_checks:
+            self._try_inspecting_global(global_name, expected_output)
+
+    @wimpy
+    def test_read_ushort4(self):
+        self._try_inspecting_global('ushort4_global',
+                                    ['(ushort4)', '(666, 777, 888, 999)'])
+
+    def test_read_vec4(self):
+        # Remaining 4-element vectors plus the two smaller matrix types.
+        vector_checks = [
+            ('int4_global', ['(int4)', '(888, 999, -1111, 2222)']),
+            ('uint4_global', ['(uint4)', '(1111, 2222, 3333, 4444)']),
+            ('float4_global', ['(float4)', '(3, 4.5, -5, -6.5)']),
+            ('long4_global', ['(long4)', '(-3333, -4444, 5555, 6666)']),
+            ('ulong4_global', ['(ulong4)', '(5555, 6666, 7777, 8888)']),
+            ('double4_global', ['(double4)', '(-77, 88.5, -99, 111.5)']),
+            ('matrix2x2_global',
+             ['(rs_matrix2x2)',
+              '= (m = (1, 2.5, 3, 4.5))']),
+            ('matrix3x3_global',
+             ['(rs_matrix3x3)',
+              '= {\n'
+              '  m = ([0] = 5, [1] = 6.5, [2] = 7, [3] = 8.5, [4] = 9, [5] = 1.5, [6] = 2, [7] = 3.5, [8] = 4)']),
+        ]
+        for global_name, expected_output in vector_checks:
+            self._try_inspecting_global(global_name, expected_output)
+
+    @wimpy
+    def test_read_matrix(self):
+        # The 4x4 matrix is expected with one element per output line.
+        matrix4x4_text = ('= {\n'
+                          '  m = {\n'
+                          '    [0] = 5.5\n'
+                          '    [1] = 6\n'
+                          '    [2] = 7.5\n'
+                          '    [3] = 8\n'
+                          '    [4] = 9\n'
+                          '    [5] = 1.5\n'
+                          '    [6] = 2\n'
+                          '    [7] = 3.5\n'
+                          '    [8] = 4.5\n'
+                          '    [9] = 5.5\n'
+                          '    [10] = 6.5\n'
+                          '    [11] = 7\n'
+                          '    [12] = 8\n'
+                          '    [13] = 9.5\n'
+                          '    [14] = 1.5\n'
+                          '    [15] = 2.5\n'
+                          '  }\n')
+        self._try_inspecting_global('matrix4x4_global',
+                                    ['(rs_matrix4x4)', matrix4x4_text])
+
+    @wimpy
+    def test_read_quaternion(self):
+        self._try_inspecting_global('quaternion_global',
+                                    ['(rs_quaternion)', '(4.5, 5.5, 6, 3)'])
+
+    @ordered_test('last')
+    @cpp_only_test()
+    def test_cpp_cleanup(self):
+        # Drop breakpoint 1, then let the process run on; the second command
+        # is expected to report an exit status of 0.
+        teardown_steps = (
+            ('breakpoint delete 1', ['1 breakpoints deleted']),
+            ('process continue', ['exited with status = 0']),
+        )
+        for command, expected in teardown_steps:
+            self.try_command(command, expected)

diff --git a/tests/lldb/tests/testcases/test_read_local.py b/tests/lldb/tests/testcases/test_read_local.py
new file mode 100644
index 0000000..9a6a80f
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_read_local.py

@@ -0,0 +1,344 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestReadLocal.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    wimpy,
+    ordered_test,
+    skip_conditional,
+    cpp_only_test
+)
+
+
+class TestReadLocal(TestBaseRemote):
+    '''Tests inspecting local variables of all types.'''
+
+    bundle_target = {
+        'java': 'KernelVariables',
+        'jni': 'JNIKernelVariables',
+        'cpp': 'CppKernelVariables'
+    }
+
+    def _try_inspecting_local(self, local_name, expected_output,
+                             expected_regex=None):
+        '''Inspect a local with two lldb commands and check both outputs.
+
+        Issues "expr <local_name>" followed by "frame variable <local_name>",
+        handing the same expectations to each. (The commands should be
+        equivalent.)
+
+        Args:
+            local_name: String which is the name of the local to inspect.
+            expected_output: List of strings that should be found in the output.
+            expected_regex: List of regular expressions that should match lldb's
+                            output.
+
+        Raises:
+            TestFail: One of the lldb commands did not provide the expected
+                      output.
+        '''
+        for lldb_command in ('expr', 'frame variable'):
+            self.try_command(lldb_command + ' ' + local_name,
+                             expected_output,
+                             expected_regex)
+
+    @wimpy
+    @ordered_test(0)
+    def test_setup(self):
+        '''Check the runtime status, set a line breakpoint and run to it.'''
+        runtime_status = ['Runtime Library discovered',
+                          'Runtime Driver discovered']
+        self.try_command('language renderscript status', runtime_status)
+
+        self.try_command('breakpoint set --file simple.rs --line 145', [])
+
+        stop_markers = ['resuming', 'stopped', 'stop reason = breakpoint']
+        self.try_command('process continue', stop_markers)
+
+    @wimpy
+    def test_list_rs_kernel_frame_variables(self):
+        '''Run a bare "frame variable" and check the listed kernel locals.'''
+        # pylint: disable=line-too-long
+
+        # Literal fragments passed as the expected output of the command.
+        expected_lines = [
+            "(uchar) uchar_local = 'b'",
+            '(short) short_local = -321',
+            '(ushort) ushort_local = 432',
+            '(int) int_local = 1234',
+            '(uint) uint_local = 2345',
+            '(float) float_local = 4.5',
+            '(ulong) ulong_local = 8888',
+            '(double) double_local = -456.5',
+            '(char2) char2_local = (-11, -22)',
+            '(uchar2) uchar2_local = (0x21, 0x2c)',
+            '(short2) short2_local = (-555, 666)',
+            '(ushort2) ushort2_local = (777, 888)',
+            '(int2) int2_local = (999, -1111)',
+            '(uint2) uint2_local = (2222, 3333)',
+            '(float2) float2_local = (4.5, -5)',
+            '(long2) long2_local = (-4444, 5555)',
+            '(ulong2) ulong2_local = (6666, 7777)',
+            '(double2) double2_local = (88.5, -99)',
+            '(char3) char3_local = (11, -22, -33,',
+            '(uchar3) uchar3_local = (0x21, 0x2c, 0x37,',
+            '(short3) short3_local = (-555, 666, 777,',
+            '(ushort3) ushort3_local = (777, 888, 999,',
+            '(int3) int3_local = (999, -1111, 2222,',
+            '(uint3) uint3_local = (2222, 3333, 4444,',
+            '(float3) float3_local = (4.5, -5, -6.5,',
+            '(long3) long3_local = (-4444, 5555, 6666,',
+            '(ulong3) ulong3_local = (6666, 7777, 8888,',
+            '(double3) double3_local = (88.5, -99, 111.5,',
+            '(char4) char4_local = (55, 11, -22, -33)',
+            '(uchar4) uchar4_local = (0x16, 0x21, 0x2c, 0x37)',
+            '(short4) short4_local = (-444, -555, 666, 777)',
+            '(ushort4) ushort4_local = (666, 777, 888, 999)',
+            '(int4) int4_local = (888, 999, -1111, 2222)',
+            '(uint4) uint4_local = (1111, 2222, 3333, 4444)',
+            '(float4) float4_local = (3, 4.5, -5, -6.5)',
+            '(long4) long4_local = (-3333, -4444, 5555, 6666)',
+            '(ulong4) ulong4_local = (5555, 6666, 7777, 8888)',
+            '(double4) double4_local = (-77, 88.5, -99, 111.5)',
+            '(rs_matrix2x2) matrix2x2_local = (m = (1, 2.5, 3, 4.5))',
+            '(rs_matrix3x3) matrix3x3_local = {\n'
+            '  m = ([0] = 5, [1] = 6.5, [2] = 7, [3] = 8.5, [4] = 9, [5] = 1.5, [6] = 2, [7] = 3.5, [8] = 4)',
+            '(rs_matrix4x4) matrix4x4_local = {\n'
+            '  m = {\n'
+            '    [0] = 5.5\n'
+            '    [1] = 6\n'
+            '    [2] = 7.5\n'
+            '    [3] = 8\n'
+            '    [4] = 9\n'
+            '    [5] = 1.5\n'
+            '    [6] = 2\n'
+            '    [7] = 3.5\n'
+            '    [8] = 4.5\n'
+            '    [9] = 5.5\n'
+            '    [10] = 6.5\n'
+            '    [11] = 7\n'
+            '    [12] = 8\n'
+            '    [13] = 9.5\n'
+            '    [14] = 1.5\n'
+            '    [15] = 2.5\n'
+            '  }\n',
+            '(rs_quaternion) quaternion_local = (8, 9, 0.5, 7.5)',
+        ]
+
+        # These patterns accept either spelling of the type name:
+        # "char" or "signed char", and "long" or "long long".
+        expected_patterns = [
+            r"\((signed )?char\) char_local = 'a'",
+            r'\((long )?long\) long_local = -77777',
+        ]
+
+        self.try_command('frame variable', expected_lines, expected_patterns)
+
+
+    @wimpy
+    def test_inspect_primitive_types(self):
+        '''Inspect each scalar local in turn via _try_inspecting_local.'''
+        # One row per local: (name, literal fragments, regexes or None).
+        # The char and long rows match the type by regex instead of literally.
+        scalar_checks = (
+            ('char_local', ["'a'"], [r'\((signed )?char\)']),
+            ('uchar_local', ['(uchar)', "'b'"], None),
+            ('short_local', ['(short)', '-321'], None),
+            ('ushort_local', ['(ushort)', '432'], None),
+            ('int_local', ['(int)', '1234'], None),
+            ('uint_local', ['(uint)', '2345'], None),
+            ('float_local', ['(float)', '4.5'], None),
+            ('long_local', ['-77777'], [r'\((long )?long\)']),
+            ('ulong_local', ['(ulong)', '8888'], None),
+            ('double_local', ['(double)', '-456.5'], None),
+        )
+        for name, fragments, patterns in scalar_checks:
+            self._try_inspecting_local(name, fragments, patterns)
+
+
+    @wimpy
+    def test_inspect_uchar2(self):
+        '''Inspect uchar2_local and check its type and hex components.'''
+        expected = ['(uchar2)', '(0x21, 0x2c)']
+        self._try_inspecting_local('uchar2_local', expected)
+
+    def test_inspect_vec2_types(self):
+        '''Inspect the 2-element vector locals, then char3 and uchar3.'''
+        # Rows are checked in order: (local name, type tag, printed value).
+        vector_checks = (
+            ('char2_local', '(char2)', '(-11, -22)'),
+            ('short2_local', '(short2)', '(-555, 666)'),
+            ('ushort2_local', '(ushort2)', '(777, 888)'),
+            ('int2_local', '(int2)', '(999, -1111)'),
+            ('uint2_local', '(uint2)', '(2222, 3333)'),
+            ('float2_local', '(float2)', '(4.5, -5)'),
+            ('long2_local', '(long2)', '(-4444, 5555)'),
+            ('ulong2_local', '(ulong2)', '(6666, 7777)'),
+            ('double2_local', '(double2)', '(88.5, -99)'),
+            ('char3_local', '(char3)', '(11, -22, -33,'),
+            ('uchar3_local', '(uchar3)', '(0x21, 0x2c, 0x37,'),
+        )
+        for name, type_tag, value in vector_checks:
+            self._try_inspecting_local(name, [type_tag, value])
+
+    @wimpy
+    def test_inspect_short3(self):
+        '''Inspect short3_local and check its type and leading components.'''
+        expected = ['(short3)', '(-555, 666, 777,']
+        self._try_inspecting_local('short3_local', expected)
+
+    def test_inspect_vec3_types(self):
+        '''Inspect the 3-element vector locals, then char4, uchar4 and short4.'''
+        # Rows are checked in order: (local name, type tag, printed value).
+        vector_checks = (
+            ('ushort3_local', '(ushort3)', '(777, 888, 999,'),
+            ('int3_local', '(int3)', '(999, -1111, 2222,'),
+            ('uint3_local', '(uint3)', '(2222, 3333, 4444,'),
+            ('float3_local', '(float3)', '(4.5, -5, -6.5,'),
+            ('long3_local', '(long3)', '(-4444, 5555, 6666,'),
+            ('ulong3_local', '(ulong3)', '(6666, 7777, 8888,'),
+            ('double3_local', '(double3)', '(88.5, -99, 111.5,'),
+            ('char4_local', '(char4)', '(55, 11, -22, -33)'),
+            ('uchar4_local', '(uchar4)', '(0x16, 0x21, 0x2c, 0x37)'),
+            ('short4_local', '(short4)', '(-444, -555, 666, 777)'),
+        )
+        for name, type_tag, value in vector_checks:
+            self._try_inspecting_local(name, [type_tag, value])
+
+    @wimpy
+    def test_inspect_ushort4(self):
+        '''Inspect ushort4_local and check its type and four components.'''
+        expected = ['(ushort4)', '(666, 777, 888, 999)']
+        self._try_inspecting_local('ushort4_local', expected)
+
+    def test_inspect_vec4_types(self):
+        '''Inspect the remaining 4-element vector locals in turn.'''
+        # Rows are checked in order: (local name, type tag, printed value).
+        vector_checks = (
+            ('int4_local', '(int4)', '(888, 999, -1111, 2222)'),
+            ('uint4_local', '(uint4)', '(1111, 2222, 3333, 4444)'),
+            ('float4_local', '(float4)', '(3, 4.5, -5, -6.5)'),
+            ('long4_local', '(long4)', '(-3333, -4444, 5555, 6666)'),
+            ('ulong4_local', '(ulong4)', '(5555, 6666, 7777, 8888)'),
+            ('double4_local', '(double4)', '(-77, 88.5, -99, 111.5)'),
+        )
+        for name, type_tag, value in vector_checks:
+            self._try_inspecting_local(name, [type_tag, value])
+    def test_inspect_matrix_types(self):
+        '''Inspect the 2x2 and 3x3 matrix locals.'''
+        # pylint: disable=line-too-long
+        matrix2x2_expected = ['(rs_matrix2x2)',
+                              '= (m = (1, 2.5, 3, 4.5))']
+        self._try_inspecting_local('matrix2x2_local', matrix2x2_expected)
+
+        # The two adjacent literals join into one expected multi-line string.
+        matrix3x3_dump = (
+            '= {\n'
+            '  m = ([0] = 5, [1] = 6.5, [2] = 7, [3] = 8.5, [4] = 9, [5] = 1.5, [6] = 2, [7] = 3.5, [8] = 4)'
+        )
+        self._try_inspecting_local('matrix3x3_local',
+                                   ['(rs_matrix3x3)', matrix3x3_dump])
+
+    @wimpy
+    def test_inspect_matrix_4x4_local(self):
+        '''Inspect matrix4x4_local and check its type and all sixteen cells.'''
+        # Adjacent literals concatenate into one multi-line expected string.
+        matrix_dump = (
+            '= {\n'
+            '  m = {\n'
+            '    [0] = 5.5\n'
+            '    [1] = 6\n'
+            '    [2] = 7.5\n'
+            '    [3] = 8\n'
+            '    [4] = 9\n'
+            '    [5] = 1.5\n'
+            '    [6] = 2\n'
+            '    [7] = 3.5\n'
+            '    [8] = 4.5\n'
+            '    [9] = 5.5\n'
+            '    [10] = 6.5\n'
+            '    [11] = 7\n'
+            '    [12] = 8\n'
+            '    [13] = 9.5\n'
+            '    [14] = 1.5\n'
+            '    [15] = 2.5\n'
+            '  }\n'
+        )
+        self._try_inspecting_local('matrix4x4_local',
+                                   ['(rs_matrix4x4)', matrix_dump])
+
+    @wimpy
+    def test_inspect_quaternion_local(self):
+        '''Inspect quaternion_local and check its type and components.'''
+        expected = ['(rs_quaternion)', '(8, 9, 0.5, 7.5)']
+        self._try_inspecting_local('quaternion_local', expected)
+
+    @ordered_test('last')
+    @cpp_only_test()
+    def test_cpp_cleanup(self):
+        '''Delete breakpoint 1, then resume and expect exit status 0.'''
+        cleanup_steps = (
+            ('breakpoint delete 1', ['1 breakpoints deleted']),
+            ('process continue', ['exited with status = 0']),
+        )
+        for command, expected in cleanup_steps:
+            self.try_command(command, expected)

diff --git a/tests/lldb/tests/testcases/test_reduction.py b/tests/lldb/tests/testcases/test_reduction.py
new file mode 100644
index 0000000..9653c02
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_reduction.py

@@ -0,0 +1,279 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+
+import itertools
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    ordered_test,
+    wimpy,
+)
+from harness.assert_mixins import CoordinateAssertionsMixin
+
+from reduce_common import (
+    REDUCE_ITERATIONS,
+    REDUCE_STARTVAL,
+    REDUCE_SCRIPT,
+    X_TESTS,
+    Y_TESTS,
+    Z_TESTS,
+    ReductionMixin,
+)
+
+
+def coords_range_3d(x_range, y_range, z_range):
+    """
+    Lazily yield (x, y, z) tuples that sweep every axis at least once.
+
+    Each axis cycles through range(<axis>_range) independently; the three
+    cycles are zipped together and truncated after max(x_range, y_range,
+    z_range) tuples, so the longest axis is covered exactly once and the
+    shorter ones wrap around.
+
+    Args:
+        x_range: Number of distinct x values (iterates 0 .. x_range - 1).
+        y_range: Number of distinct y values.
+        z_range: Number of distinct z values.
+
+    Returns:
+        An iterator of (x, y, z) integer tuples.
+    """
+    # itertools.izip only exists on Python 2; Python 3's builtin zip is
+    # already lazy. A lazy zip is required because the cycles are infinite.
+    lazy_zip = getattr(itertools, 'izip', zip)
+    count = max((x_range, y_range, z_range))
+    x = itertools.cycle(range(x_range))
+    y = itertools.cycle(range(y_range))
+    z = itertools.cycle(range(z_range))
+    return itertools.islice(
+        lazy_zip(x, y, z),
+        count
+    )
+
+
+class TestReduce1DSingleThreaded(
+        TestBaseRemote, CoordinateAssertionsMixin, ReductionMixin):
+    """
+    Reduction kernels for RenderScript are launched using
+    a different `.expand` function than regular `ForEach` kernels and reflect a
+    different API to the invoking program
+
+    Although the debugger implementation for accessing these features tracks
+    this slightly differently for reduction kernels, the user interface should
+    still offer the basic functionality:
+        - breakpoints on a coordinate
+        - tracking, viewing and dumping allocations
+        - listing modules and constituent kernels and types
+    """
+
+    bundle_target = {
+        'java': 'Reduction',
+    }
+
+    def _delete_breakpoints(self):
+        """Force-delete all breakpoints, ignoring a TestFail from the command."""
+        try:
+            self.do_command('breakpoint delete -f')
+        except self.TestFail:
+            # Best effort: a failing delete is deliberately not reported.
+            return
+
+    def setup(self, android):
+        """Limit RS to a single thread; this test must run on one thread."""
+        single_thread = 1
+        android.push_prop('debug.rs.max-threads', single_thread)
+
+    def teardown(self, android):
+        """Pop the RS thread-limit property pushed by setup."""
+        thread_limit_prop = 'debug.rs.max-threads'
+        android.pop_prop(thread_limit_prop)
+
+    @ordered_test(0)
+    @wimpy
+    def test_setup(self):
+        """Query the RS status, break on the accumulator and continue."""
+        startup_commands = (
+            'language renderscript status',
+            'b find_min_user_type_accum',
+            'c',
+        )
+        for command in startup_commands:
+            # No particular output is required from any of these commands.
+            self.try_command(command, [])
+
+    @ordered_test(1)
+    @wimpy
+    def test_renderscript_module_dump(self):
+        """
+        Generalised Reduction kernels for RenderScript are not tracked in the
+        same way as `ForEach` kernels, and do not have `__attribute__((kernel))`
+        so we need to make sure that when a module contains reduction kernels,
+        `language renderscript module dump` in lldb prints the correct kernels.
+        """
+        reduction_summary = [
+            'Reductions: 1',
+            'find_min_user_type',
+            'accumulator: find_min_user_type_accum',
+            'combiner: find_min_user_type_comb',
+            'outconverter: find_min_user_type_outc'
+        ]
+        self.try_command('language renderscript module dump',
+                         reduction_summary)
+
+    @ordered_test(2)
+    @wimpy
+    def test_module_dump_with_foreach_kernel_separate(self):
+        """
+        The reduction breakpoint is separate from that of a standard kernel
+        function breakpoint, so we need to make sure that when we dump a module,
+        reductions are properly collected and displayed alongside the standard
+        __attribute__((kernel)) functions.
+        Assert that `... module dump` can correctly distinguish between `reduce`
+        kernels and `ForEach` kernels.
+        """
+        # Kernel and reduction counts plus each reduction role's function.
+        module_summary = [
+            'Kernels: 2',
+            'Reductions: 1',
+            'accumulator: find_min_user_type_accum',
+            'initializer: find_min_user_type_init',
+            'combiner: find_min_user_type_comb',
+            'outconverter: find_min_user_type_outc'
+        ]
+        self.try_command('language renderscript module dump', module_summary)
+
+    @wimpy
+    @ordered_test(3)
+    def test_reduction_breakpoint_set_all_roles_resolved(self):
+        """
+        Assert that a reduction breakpoint successfully resolves all the
+        functions that make up the reduction kernel
+
+        Sets a reduction breakpoint on find_min_user_type, continues until the
+        process stops on it, then checks that `breakpoint list` reports four
+        resolved locations (init, accum, comb and outc).
+        """
+        self.try_command(
+            'language renderscript reduction breakpoint set find_min_user_type',
+            ['Breakpoint(s) created']
+        )
+
+        self.try_command(
+            'process continue',
+            expected_regex=[
+                r'Process \d+ stopped',
+                r'librs.reduce.so`find_min_user_type',
+                r'stop reason = breakpoint'
+            ]
+        )
+        name = REDUCE_SCRIPT
+        # The patterns are raw literals so that `\+` and `\d` reach the regex
+        # engine verbatim instead of being invalid string escape sequences.
+        self.try_command(
+            'breakpoint list',
+            expected_regex=[
+                r"RenderScript reduce breakpoint for 'find_min_user_type', locations = 4, resolved = 4",
+                r'where = librs.reduce.so`find_min_user_type_init (\+ \d+ )?at %s(.+, resolved,)' % name,
+                r'where = librs.reduce.so`find_min_user_type_accum (\+ \d+ )?at %s(.+, resolved,)' % name,
+                r'where = librs.reduce.so`find_min_user_type_comb (\+ \d+ )?at %s(.+, resolved,)' % name,
+                r'where = librs.reduce.so`find_min_user_type_outc (\+ \d+ )?at %s(.+, resolved,)' % name,
+            ]
+        )
+
+    @ordered_test(4)
+    def test_reduce_iterations(self):
+        """
+        Given a reduction, we want to make sure that we break on
+        every accumulator invocation before seeing the outconverter called.
+        This requires the tests to be run single threaded
+
+        Stops once in the initializer first; at every accumulator stop `val.b`
+        is expected to print the iteration index plus REDUCE_STARTVAL.
+        """
+        set_breakpoint = (
+            'language renderscript reduction breakpoint set find_min_user_type')
+
+        def stop_patterns(role_suffix, with_resume):
+            # Regexes describing a breakpoint stop in the given role function.
+            patterns = [r'Process \d+ resuming'] if with_resume else []
+            patterns.extend([
+                r'Process \d+ stopped',
+                r'librs.reduce.so`find_min_user_type_' + role_suffix,
+                r'stop reason = breakpoint',
+            ])
+            return patterns
+
+        # Phase 1: the initializer on its own.
+        self._delete_breakpoints()
+        self.try_command(set_breakpoint + ' -t initializer')
+        self.try_command('process continue',
+                         expected_regex=stop_patterns('init', False))
+        self._delete_breakpoints()
+
+        # Phase 2: one stop per accumulator invocation.
+        self.try_command(
+            set_breakpoint + ' --function-role accumulator,outconverter',
+            ['Breakpoint(s) created']
+        )
+        for iteration in range(REDUCE_ITERATIONS):
+            self.try_command('process continue',
+                             expected_regex=stop_patterns('accum', True))
+            self.try_command('p val')
+            expected_value = iteration + REDUCE_STARTVAL
+            self.try_command(
+                'p val.b',
+                expected_regex=[
+                    r'^\((const )?int32_t\)\s*\$\d+ = %s\s*$' % expected_value
+                ]
+            )
+
+        # We should then finally break on the outconverter
+        self.try_command('process continue',
+                         expected_regex=stop_patterns('outc', True))
+
+    @ordered_test(5)
+    def test_function_role_breakpoints_combinations(self):
+        """Run the shared role-combination check on accumulator/initializer."""
+        roles = ('accumulator', 'initializer')
+        role_pairs = itertools.combinations(roles, 2)
+        self._test_func_role_combinations(role_pairs)
+
+    @wimpy
+    @ordered_test(6)
+    def test_resolve_function_role_all_reduce_functions(self):
+        """
+        Assert that a reduction breakpoint successfully resolves all the
+        functions that make up the reduction kernel when the parameter `all` is
+        passed to `--function-role` for the breakpoint command
+        """
+        self._delete_breakpoints()
+        self.try_command(
+            'language renderscript reduction breakpoint set find_min_user_type -t all',
+            [r'Breakpoint(s) created']
+        )
+        self.try_command('c', [])
+
+        # Header lines first, then one location pattern per role function.
+        listing_patterns = [
+            r"Current breakpoints:",
+            r"RenderScript reduce breakpoint for 'find_min_user_type', locations = 4, resolved = 4",
+            r"Names:",
+            r"RenderScriptReduction",
+        ]
+        location_template = r"where = librs.reduce.so`%s (\+ \d+ )?at %s:\d+, address = 0x[0-9a-fA-F]+, resolved"
+        for role_suffix in ('accum', 'init', 'comb', 'outc'):
+            function_name = 'find_min_user_type_%s' % role_suffix
+            listing_patterns.append(
+                location_template % (function_name, REDUCE_SCRIPT))
+
+        self.try_command('breakpoint list', expected_regex=listing_patterns)
+
+    @ordered_test(8)
+    def test_reduce_breakpoint_conditional_1d_coordinate(self):
+        """
+        Assert that breakpoints conditional on an allocation coordinate
+        are only triggered on that coordinate
+        """
+        coordinates = sorted(coords_range_3d(X_TESTS, Y_TESTS, Z_TESTS))
+        for coordinate in coordinates:
+            # Only the x component is used for this 1D reduction.
+            x = coordinate[0]
+            self._delete_breakpoints()
+            self.assert_coord_bp_set(
+                'find_min_user_type -t accumulator',
+                x,
+                kernel_type='reduction'
+            )
+            self.assert_coord_stop('reduce', 'find_min_user_type', x)
+            # Step *into* the function so locals are available
+            # FIXME remove the need for `next` here; skip the function prologue
+            for command in ('n', 'p accum->a', 'p accum->b'):
+                self.try_command(command)
+
+    @ordered_test('last')
+    def test_exit(self):
+        """Kill the debugged process; no particular output is required."""
+        no_expected_output = []
+        self.try_command('process kill', no_expected_output)

diff --git a/tests/lldb/tests/testcases/test_reduction_combiner.py b/tests/lldb/tests/testcases/test_reduction_combiner.py
new file mode 100644
index 0000000..c070f3b
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_reduction_combiner.py

@@ -0,0 +1,134 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+
+import re
+import itertools
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    ordered_test,
+    wimpy,
+    skip_conditional,
+)
+from harness.assert_mixins import CoordinateAssertionsMixin
+
+from reduce_common import (
+    REDUCE_SCRIPT,
+    REDUCE_AUTO_COMB_SCRIPT,
+    ReductionMixin,
+)
+
+
+def multithreaded():
+    """
+    Return skip_conditional(...) configured with a predicate that is true
+    when the test instance's cpu_count is 1; used as `@multithreaded()`.
+    """
+    def _single_cpu(self):
+        return self.cpu_count == 1
+
+    return skip_conditional(
+        _single_cpu,
+        "skipping multithreaded test (1 CPU present)"
+    )
+
+
+class TestReduceCombinerMultithreaded(
+        TestBaseRemote, CoordinateAssertionsMixin, ReductionMixin):
+    """
+    RenderScript reduction combiners are currently run only on the output of a
+    parallel reduction step for the CPU reference implementation.  These
+    testcases test LLDB's handling of breakpoints for the combiner function.
+    """
+
+    bundle_target = {
+        'java': 'Reduction',
+    }
+
+    def _delete_breakpoints(self):
+        """Force-delete all breakpoints, ignoring a TestFail from the command."""
+        try:
+            self.do_command('breakpoint delete -f')
+        except self.TestFail:
+            # Best effort: a failing delete is deliberately not reported.
+            return
+
+    def setup(self, android):
+        """
+        This test *must* be run on multiple threads, and is skipped if the
+        device does not support multiple threads
+
+        Reads the online-CPU list from sysfs, stores the number of online
+        CPUs in self.cpu_count and pushes it as the RS thread limit.
+
+        Raises:
+            TestFail: the CPU list is not a comma-separated list of "N" or
+                      "N-M" entries starting at CPU 0.
+        """
+        cpu_spec = android.shell("cat /sys/devices/system/cpu/online").strip()
+        # Every range needs an explicit end ("N-M"); a dangling "N-" is
+        # rejected here instead of crashing in int('') below.
+        match = re.search(r'(^0(-\d+)?(,\d+(-\d+)?)*)$', cpu_spec)
+        if not match or not match.groups():
+            raise self.TestFail(
+                "unable to parse number of available CPUs in %r" % cpu_spec)
+
+        def parse_range(s):
+            # A single id counts as one CPU; "a-b" is an inclusive range and
+            # therefore names b - a + 1 CPUs.
+            r = s.split('-')
+            if len(r) == 1:
+                return 1
+            return int(r[1]) - int(r[0]) + 1
+
+        self.cpu_count = sum(map(parse_range, cpu_spec.split(',')))
+        # NOTE(review): the previous code pushed cpu_count + 1 while counting
+        # "a-b" as b - a; for the common "0-N" spec the pushed value is
+        # unchanged. Confirm no caller relied on the extra thread.
+        android.push_prop('debug.rs.max-threads', self.cpu_count)
+
+    def teardown(self, android):
+        """Pop the RS thread-limit property pushed by setup."""
+        thread_limit_prop = 'debug.rs.max-threads'
+        android.pop_prop(thread_limit_prop)
+
+    @multithreaded()
+    @ordered_test(0)
+    @wimpy
+    def test_setup(self):
+        """Set a breakpoint on the auto-combiner reduction and run to it."""
+        self.try_command('language renderscript status', [])
+        # first point of order: make sure the compiled script is properly
+        # loaded and that we can set a breakpoint on the named reduction
+        reduction_name = 'find_min_user_type_auto_comb'
+        self.try_command(
+            'language renderscript reduction breakpoint set ' + reduction_name)
+
+        stop_patterns = [
+            r'Process \d+ stopped',
+            r'frame #0: (0x[0-9a-fA-F]+ )?librs.reduce_auto_comb.so`'
+        ]
+        self.try_command('process continue', expected_regex=stop_patterns)
+
+    @multithreaded()
+    def test_function_role_breakpoints_combinations(self):
+        """Run the shared role-combination check on every pair of roles."""
+        roles = ('accumulator', 'outconverter', 'initializer', 'combiner')
+        role_pairs = itertools.combinations(roles, 2)
+        self._test_func_role_combinations(role_pairs)
+
+    @multithreaded()
+    def test_reduction_breakpoint_set_single_type_user_comb(self):
+        """Run the single-role breakpoint check on the user-combiner script."""
+        # (function name, role name) pairs handed to the shared helper.
+        role_functions = (
+            ('find_min_user_type_init', 'initializer'),
+            ('find_min_user_type_accum', 'accumulator'),
+            ('find_min_user_type_comb', 'combiner'),
+            ('find_min_user_type_outc', 'outconverter')
+        )
+        return self._reduction_breakpoint_set_single_type(
+            'reduce', REDUCE_SCRIPT, 'find_min_user_type', role_functions)
+
+    @multithreaded()
+    def test_reduction_breakpoint_set_single_type_auto_comb(self):
+        """Run the single-role breakpoint check on the auto-combiner script."""
+        # (function name, role name) pairs handed to the shared helper; the
+        # combiner entry here is 'find_min_user_type_accum.combiner'.
+        role_functions = (
+            ('find_min_user_type_init', 'initializer'),
+            ('find_min_user_type_accum', 'accumulator'),
+            ('find_min_user_type_accum.combiner', 'combiner'),
+            ('find_min_user_type_outc', 'outconverter')
+        )
+        return self._reduction_breakpoint_set_single_type(
+            'reduce_auto_comb',
+            REDUCE_AUTO_COMB_SCRIPT,
+            'find_min_user_type_auto_comb',
+            role_functions
+        )

diff --git a/tests/lldb/tests/testcases/test_rs_consts.py b/tests/lldb/tests/testcases/test_rs_consts.py
new file mode 100644
index 0000000..9a7ba70
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_rs_consts.py

@@ -0,0 +1,94 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestRSConsts.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    ordered_test,
+    cpp_only_test,
+)
+
+
+class TestRSConsts(TestBaseRemote):
+    '''Checks that the RenderScript math constants can be evaluated.'''
+
+    bundle_target = {
+        'java': 'KernelVariables',
+        'jni': 'JNIKernelVariables',
+        'cpp': 'CppKernelVariables'
+    }
+
+    def test_rs_consts(self):
+        # Make sure the runtime has been found, then stop inside a kernel so
+        # that expressions can be evaluated.
+        runtime_markers = ['Runtime Library discovered',
+                           'Runtime Driver discovered']
+        self.try_command('language renderscript status', runtime_markers)
+
+        self.try_command('language renderscript kernel breakpoint set kernel',
+                         [])
+
+        stop_markers = ['resuming', 'stopped', 'stop reason = breakpoint']
+        self.try_command('process continue', stop_markers)
+
+        # Constant name -> leading digits expected in the 'expr' output.
+        # The order matches the order in which the constants are evaluated.
+        expected_constants = (
+            ('M_1_PI', '0.318309'),
+            ('M_2_PI', '0.636619'),
+            ('M_2_SQRTPI', '1.128379'),
+            ('M_E', '2.718281'),
+            ('M_LN10', '2.302585'),
+            ('M_LN2', '0.693147'),
+            ('M_LOG10E', '0.434294'),
+            ('M_LOG2E', '1.442695'),
+            ('M_PI', '3.141592'),
+            ('M_PI_2', '1.570796'),
+            ('M_PI_4', '0.785398'),
+            ('M_SQRT1_2', '0.707106'),
+            ('M_SQRT2', '1.414213'),
+        )
+        for constant_name, leading_digits in expected_constants:
+            self.try_command('expr ' + constant_name, [leading_digits])
+
+    @ordered_test('last')
+    @cpp_only_test()
+    def test_cpp_cleanup(self):
+        # Remove the kernel breakpoint, then let the process run to its exit.
+        for command, expected in (
+                ('breakpoint delete 1', ['1 breakpoints deleted']),
+                ('process continue', ['exited with status = 0'])):
+            self.try_command(command, expected)
+

diff --git a/tests/lldb/tests/testcases/test_script_group.py b/tests/lldb/tests/testcases/test_script_group.py
new file mode 100644
index 0000000..b3b3186
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_script_group.py

@@ -0,0 +1,109 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestScriptGroup.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import wimpy
+
+
+class TestScriptGroup(TestBaseRemote):
+    '''Tests script group breakpoints and listing on the ScriptGroup app.'''
+
+    bundle_target = {
+        'java': 'ScriptGroup'
+    }
+
+    def setup(self, android):
+        '''This test requires to be run on one thread.'''
+        android.push_prop('debug.rs.max-threads', 1)
+
+    def teardown(self, android):
+        '''Reset the number of RS threads to the previous value.'''
+        android.pop_prop('debug.rs.max-threads')
+
+    @wimpy
+    def test_kernel_backtrace(self):
+        # Sets a pending script group breakpoint, checks the group is listed
+        # once it has been hit, then walks through one stop per allocation
+        # element in kernel foo followed by one stop per element in kernel
+        # goo, checking the value of 'a' at each stop.
+
+        # number of allocation elements
+        array_size = 8
+        # index of the last element; the stop after it belongs to the next
+        # kernel (or, for goo, there is no further stop to wait for)
+        last_index = array_size - 1
+
+        self.try_command('language renderscript status',
+                         ['Runtime Library discovered',
+                          'Runtime Driver discovered',
+                          'rsdDebugHintScriptGroup2'])
+
+        self.try_command('language renderscript scriptgroup breakpoint set scriptgroup_test',
+                         ['Breakpoint 1: no locations (pending)'])
+
+        self.try_command('language renderscript scriptgroup list',
+                         ['0 script groups'])
+
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint',
+                          'librs.scriptgroup.so`foo',
+                          'scriptgroup.rs'])
+
+        self.try_command('breakpoint list',
+                         ['scriptgroup_test',
+                          'locations = 1'])
+
+        self.try_command('language renderscript scriptgroup list',
+                         ['1 script group',
+                          'scriptgroup_test',
+                          'foo',
+                          'goo'])
+
+        self.try_command('language renderscript scriptgroup breakpoint set --stop-on-all scriptgroup_test',
+                         ['Breakpoint 2: 2 locations'])
+
+        self.try_command('breakpoint list',
+                         ['scriptgroup_test',
+                          'librs.scriptgroup.so`foo',
+                          'librs.scriptgroup.so`goo'])
+
+        # iterate over foo kernels
+        self.try_command('bt',
+                         ['scriptgroup.rs:',
+                          'frame #0', 'librs.scriptgroup.so`foo',
+                          'frame #1', 'librs.scriptgroup.so`foo.expand'])
+
+        for x in range(array_size):
+            self.try_command('frame var',
+                             ['(int) a = {0}'.format(x)])
+            # continuing from the last foo element is expected to stop in goo
+            self.try_command('process continue',
+                             ['resuming',
+                              'stopped',
+                              'stop reason = breakpoint',
+                              'librs.scriptgroup.so`{0}'.format(
+                                  'foo' if x < last_index else 'goo')])
+
+        # iterate over goo kernels
+        self.try_command('bt',
+                         ['stop reason = breakpoint',
+                          'scriptgroup.rs:',
+                          'frame #0', 'librs.scriptgroup.so`goo',
+                          'frame #1', 'librs.scriptgroup.so`goo.expand'])
+
+        for x in range(array_size):
+            self.try_command('frame var',
+                             ['(int) a = {0}'.format(x * x)])
+
+            # do not continue past the last goo element: no further stop in
+            # goo is expected after it
+            if x < last_index:
+                self.try_command('process continue',
+                                 ['resuming',
+                                  'stopped',
+                                  'stop reason = breakpoint',
+                                  'librs.scriptgroup.so`goo'])

diff --git a/tests/lldb/tests/testcases/test_single_source.py b/tests/lldb/tests/testcases/test_single_source.py
new file mode 100644
index 0000000..5da1d97
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_single_source.py

@@ -0,0 +1,165 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestSingleSource.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (ordered_test, wimpy)
+from harness.exception import TestSuiteException
+
+
+class TestSingleSource(TestBaseRemote):
+    '''Tests debugging the SingleSource app's invokables and their kernels.
+
+    Breakpoints are set on script_invoke_1 / script_invoke_2 and on the
+    kernels hit after entering them; allocation contents are checked each
+    time the check_in breakpoint is reached.
+    '''
+
+    bundle_target = {
+        'java': "SingleSource"
+    }
+
+    def setup(self, android):
+        '''This test requires to be run on one thread.'''
+        android.push_prop('debug.rs.max-threads', 1)
+
+    def teardown(self, android):
+        '''Reset the number of RS threads to the previous value.'''
+        android.pop_prop('debug.rs.max-threads')
+
+    def _continue_until_check_in(self, kernel_name, alloc_id, expected_dump,
+                                 max_stops=10):
+        '''Continue past kernel breakpoints until check_in is hit.
+
+        Repeatedly runs "process continue". Stops inside the given kernel are
+        skipped; when the stop is in check_in the given allocation is dumped
+        and compared against the expected output.
+
+        Args:
+            kernel_name: String, name of the kernel function whose breakpoint
+                         hits should be skipped.
+            alloc_id: Integer id of the allocation to dump at check_in.
+            expected_dump: List of strings that should be found in the
+                           output of the allocation dump.
+            max_stops: Maximum number of "process continue" commands to
+                       issue before giving up.
+
+        Raises:
+            TestSuiteException: The process stopped somewhere other than the
+                                kernel or check_in, or check_in was not
+                                reached within max_stops continues.
+        '''
+        kernel_frame = 'librs.rs_single_source.so`' + kernel_name
+        for _ in range(max_stops):
+            out = self.do_command('process continue')
+            # continue as long as there are threads hitting the kernel
+            if kernel_frame in out:
+                continue
+            # if we hit check_in we have finished with the kernel
+            if 'librs.rs_single_source.so`check_in' in out:
+                self.try_command(
+                    'language renderscript allocation dump %d' % alloc_id,
+                    expected_dump)
+                return
+            # The exception has to be raised; merely constructing it would
+            # let an unexpected stop pass silently.
+            raise TestSuiteException('unexpected breakpoint')
+        raise TestSuiteException('loop quota exceeded')
+
+    @ordered_test(-1)
+    @wimpy
+    def test_startup(self):
+        # Check the runtime was found and plant the check_in breakpoint that
+        # the later tests rely on.
+
+        # pylint: disable=line-too-long
+        self.try_command('language renderscript status',
+                         ['Runtime Library discovered',
+                          'Runtime Driver discovered'])
+
+        self.try_command('breakpoint set --name check_in',
+                         ['(pending)'])
+
+    @ordered_test(0)
+    @wimpy
+    def test_invoke_1(self):
+        # Stops in script_invoke_1, checks the initial contents of
+        # allocations 1 and 2, then follows kernel_1 and kernel_2 to their
+        # check_in stops and checks the allocations again.
+
+        # enter script_invoke_1
+        self.try_command('breakpoint set --name script_invoke_1',
+                         ['(pending)'])
+
+        self.try_command('process continue',
+                         ['stopped',
+                          'stop reason = breakpoint'],
+                         [r'librs.rs_single_source.so`script_invoke_1'])
+
+        self.try_command(
+            'language renderscript allocation dump 1',
+            ['(0, 0, 0) = 1',
+             '(1, 0, 0) = 2',
+             '(2, 0, 0) = 3',
+             '(3, 0, 0) = 4'])
+
+        self.try_command(
+            'language renderscript allocation dump 2',
+            ['(0, 0, 0) = 5',
+             '(1, 0, 0) = 6',
+             '(2, 0, 0) = 7',
+             '(3, 0, 0) = 8'])
+
+        self.try_command('breakpoint set --name `kernel_1',
+                         ['address'])
+
+        self.try_command('breakpoint set --name `kernel_2',
+                         ['address'])
+
+        # check our global allocation is visible
+        self.try_command('p global_alloc',
+                         ['(rs_allocation)',
+                          'p = 0x'])
+
+        # test kernel_1
+        self._continue_until_check_in(
+            'kernel_1', 1,
+            ['(0, 0, 0) = 25',
+             '(1, 0, 0) = 36',
+             '(2, 0, 0) = 49',
+             '(3, 0, 0) = 64'])
+
+        # test kernel_2
+        self._continue_until_check_in(
+            'kernel_2', 2,
+            ['(0, 0, 0) = 125',
+             '(1, 0, 0) = 216',
+             '(2, 0, 0) = 343',
+             '(3, 0, 0) = 512'])
+
+    @ordered_test(1)
+    @wimpy
+    def test_invoke_2(self):
+        # Stops in script_invoke_2, then follows void_kernel_1 to its
+        # check_in stop and checks allocation 4.
+
+        # enter script_invoke_2
+        self.try_command('breakpoint set --name script_invoke_2',
+                         ['address'])
+
+        self.try_command('process continue',
+                         ['stopped',
+                          'stop reason = breakpoint'],
+                         [r'librs.rs_single_source.so`script_invoke_2'])
+
+        # test void_kernel_1
+        self.try_command('breakpoint set --name void_kernel_1',
+                         ['address'])
+
+        self._continue_until_check_in(
+            'void_kernel_1', 4,
+            ['(0, 0, 0) = 0',
+             '(1, 0, 0) = 1',
+             '(2, 0, 0) = 2',
+             '(3, 0, 0) = 3'])

diff --git a/tests/lldb/tests/testcases/test_source_step.py b/tests/lldb/tests/testcases/test_source_step.py
new file mode 100644
index 0000000..c780b8a
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_source_step.py

@@ -0,0 +1,113 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestSourceStep.'''
+
+from __future__ import absolute_import
+
+import os
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    ordered_test,
+    cpp_only_test,
+)
+
+
+class TestSourceStep(TestBaseRemote):
+    '''Test stepping through the source using step-in, -over and -out.'''
+
+    bundle_target = {
+        'java': 'BranchingFunCalls',
+        'jni': 'JNIBranchingFunCalls',
+        'cpp': 'CppBranchingFunCalls'
+
+    }
+
+    def script_dir(self):
+        '''Return the script source directory for the current app type.
+
+        The path is built relative to the directory of this file (two levels
+        up) and selected by self.app_type; an app type other than 'java',
+        'cpp' or 'jni' raises KeyError. The result is used as the source map
+        target for scalars.rs.
+        '''
+        file_dir = os.path.dirname(os.path.realpath(__file__))
+        app_root = os.path.join(file_dir, '..', '..')
+
+        return {
+            'java': os.path.join(app_root, 'java', 'BranchingFunCalls', 'src', 'rs'),
+            'cpp': os.path.join(app_root, 'cpp', 'BranchingFunCalls'),
+            'jni': os.path.join(app_root, 'jni', 'BranchingFunCalls', 'jnibranchingfuncalls')
+        }[self.app_type]
+
+    def setup(self, android):
+        '''This test requires to be run on one thread.'''
+        android.push_prop('debug.rs.max-threads', 1)
+
+    def teardown(self, android):
+        '''Reset the number of RS threads to the previous value.'''
+        android.pop_prop('debug.rs.max-threads')
+
+    def test_source_thread_step_in_out(self):
+        # Stops at scalars.rs:63 and then steps through the source, checking
+        # the current-line marker ('-> N') after every step. The '#N' comments
+        # below quote the source line the debugger is stopped on before the
+        # command that follows them.
+        self.try_command('language renderscript status',
+                         ['Runtime Library discovered',
+                          'Runtime Driver discovered'])
+
+        self.try_command('b -f scalars.rs -l 63',
+                         ['(pending)'])
+
+        self.try_command('process continue',
+                         ['stopped',
+                          'stop reason = breakpoint',
+                          'scalars.rs:63'])
+
+        # set the source mapping
+        self.set_src_map('scalars.rs', self.script_dir())
+
+        self.try_command('process status',
+                         ['-> 63',
+                          'int i = in;'])
+
+        #63     int i = in;
+        self.try_command('thread step-in',
+                         ['-> 64'])
+        #64     float f = (float) i;
+        self.try_command('thread step-in',
+                         ['-> 65'])
+        #65     modify_f(&f);
+        self.try_command('thread step-over',
+                         ['-> 66'])
+        #66     modify_i(&i);
+        self.try_command('thread step-in',
+                         ['-> 49'])
+        #49         int j = *i;
+        self.try_command('b -f scalars.rs -l 54',
+                         ['librs.scalars.so`modify_i',
+                          'scalars.rs:54'])
+        self.try_command('c',
+                         ['stop reason = breakpoint',
+                          'scalars.rs:54',
+                          '-> 54'])
+        #54    set_i(i, 0);
+        # For the line number anything between #36 and #38 is fine
+        self.try_command('thread step-in',
+                         [],
+                         [r'-> 3[678]'])
+        #38    int tmp = b;
+        self.try_command('thread step-out',
+                         ['-> 54'])
+
+    # NOTE(review): the decorator order here (cpp_only_test outermost) is the
+    # reverse of test_cpp_cleanup in the sibling test files, where
+    # ordered_test('last') is outermost -- confirm both orders are equivalent.
+    @cpp_only_test()
+    @ordered_test('last')
+    def test_cpp_cleanup(self):
+        # Two breakpoints were set above (scalars.rs:63 and scalars.rs:54);
+        # delete both before letting the process run to its exit.
+        self.try_command('breakpoint delete 1', ['1 breakpoints deleted'])
+
+        self.try_command('breakpoint delete 2', ['1 breakpoints deleted'])
+
+        self.try_command('process continue',
+                         ['exited with status = 0'])
+

diff --git a/tests/lldb/tests/testcases/test_write_global.py b/tests/lldb/tests/testcases/test_write_global.py
new file mode 100644
index 0000000..1d8d301
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_write_global.py

@@ -0,0 +1,230 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestWriteGlobal.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    wimpy,
+    ordered_test,
+    cpp_only_test,
+)
+
+
+class TestWriteGlobal(TestBaseRemote):
+    '''Tests modifying global variables of all types.'''
+
+    bundle_target = {
+        'java': 'KernelVariables',
+        'jni': 'JNIKernelVariables',
+        'cpp': 'CppKernelVariables'
+    }
+
+    def _try_modifying_global(self, global_name, new_value, data_type_in,
+                             expected_output, expected_output_regex=None):
+        '''Modify and then inspect a global and check for the output.
+
+        Run the "expr" command to set a given global to a new value and
+        check that it is set afterwards by running the "target variable"
+        command. The expression sent is
+        "expr <global_name> = <data_type_in><new_value>", i.e. the new value
+        is cast to the given type.
+
+        Args:
+            global_name: String which is the name of the global to modify.
+            new_value: A string that is the new value of the global.
+            data_type_in: A string containing a c-style parenthesised data type
+                          representing the type of the global.
+            expected_output: List of strings that should be found in the output
+                             of both commands.
+            expected_output_regex: List of regular expressions that should be
+                                   found in the output of both commands (the
+                                   same list is passed to the "expr" and the
+                                   "target variable" command).
+
+        Raises:
+            TestFail: One of the lldb commands did not provide the expected
+                      output.
+        '''
+        self.try_command('expr %s = %s%s' %
+                         (global_name, data_type_in, new_value),
+                         expected_output,
+                         expected_output_regex)
+        self.try_command('target variable ' + global_name,
+                         expected_output,
+                         expected_output_regex)
+
+    @wimpy
+    @ordered_test(0)
+    def test_setup(self):
+        # Check the runtime was found, then stop at simple.rs:145 so that the
+        # remaining tests can write to the globals.
+        self.try_command('language renderscript status',
+                         ['Runtime Library discovered',
+                          'Runtime Driver discovered'])
+
+        self.try_command('b -f simple.rs -l 145', [])
+
+        self.try_command('process continue',
+                         ['resuming',
+                          'stopped',
+                          'stop reason = breakpoint'])
+
+    @wimpy
+    def test_char_global(self):
+        # -2 is expected to be displayed as the byte 0xfe; the type may be
+        # printed either as 'char' or as 'signed char', hence the regex.
+        self._try_modifying_global('char_global', '-2',
+                                  '(signed char)', ['\'\\xfe\''],
+                                  [r'\((signed )?char\)'])
+
+    def test_write_primitive_types(self):
+        # Scalar globals from uchar through double; the last call also
+        # covers the char2 vector.
+        self._try_modifying_global('uchar_global', '22',
+                                  '(uchar)', ['(uchar)', '\'\\x16\''])
+
+        self._try_modifying_global('short_global', '-33',
+                                  '(short)', ['(short)', '-33'])
+
+        self._try_modifying_global('ushort_global', '44',
+                                  '(ushort)', ['(ushort)', '44'])
+
+        self._try_modifying_global('int_global', '-55',
+                                  '(int)', ['(int)', '-55'])
+
+        self._try_modifying_global('uint_global', '66',
+                                  '(uint)', ['(uint)', '66'])
+
+        self._try_modifying_global('float_global', '-7.5',
+                                  '(float)', ['(float)', '-7.5'])
+
+        # the type may be printed as 'long' or 'long long', hence the regex
+        self._try_modifying_global('long_global', '-888888',
+                                  '(long long)', ['-888888'],
+                                  [r'\((long )?long\)'])
+
+        self._try_modifying_global('ulong_global', '99999999',
+                                  '(ulong)', ['(ulong)', '99999999'])
+
+        self._try_modifying_global('double_global', '-10101.5',
+                                  '(double)', ['(double)', '-10101.5'])
+
+        self._try_modifying_global('char2_global', '{22, 4}',
+                                  '(char2)', ['(char2)', '(22, 4)'])
+
+    @wimpy
+    def test_write_uchar2(self):
+        # uchar vector elements are expected to be displayed in hex
+        self._try_modifying_global('uchar2_global', '{44, 55}',
+                                  '(uchar2)', ['(uchar2)', '(0x2c, 0x37)'])
+
+    def test_write_vec2(self):
+        # 2-element vectors from short2 through double2; the last two calls
+        # also cover char3 and uchar3.
+        self._try_modifying_global('short2_global', '{-66, 77}',
+                                  '(short2)', ['(short2)', '(-66, 77)'])
+
+        self._try_modifying_global('ushort2_global', '{88, 99}',
+                                  '(ushort2)', ['(ushort2)', '(88, 99)'])
+
+        self._try_modifying_global('int2_global', '{111, -222}',
+                                  '(int2)', ['(int2)', '(111, -222)'])
+
+        self._try_modifying_global('uint2_global', '{333, 444}',
+                                  '(uint2)', ['(uint2)', '(333, 444)'])
+
+        self._try_modifying_global('float2_global', '{-55.5f, 6.0}',
+                                  '(float2)', ['(float2)', '(-55.5, 6)'])
+
+        self._try_modifying_global('long2_global', '{666666, -777777}',
+                                  '(long2)', ['(long2)', '(666666, -777777)'])
+
+        self._try_modifying_global('ulong2_global', '{888888, 999999}',
+                                  '(ulong2)', ['(ulong2)', '(888888, 999999)'])
+
+        self._try_modifying_global('double2_global', '{11.0000000, -0.0l}',
+                                  '(double2)', ['(double2)', '(11, -0)'])
+
+        # For 3-element vectors the expected text ends with ', ' instead of
+        # ')' so that whatever is printed after the third element is ignored.
+        self._try_modifying_global('char3_global', '{2, -3, 4}',
+                                  '(char3)', ['(char3)', '(2, -3, 4,'])
+
+        self._try_modifying_global('uchar3_global', '{\'a\', \'b\', \'c\'}',
+                                  '(uchar3)', ['(uchar3)', '(0x61, 0x62, 0x63,'])
+
+    @wimpy
+    def test_write_short3(self):
+        # only the first three printed elements are checked
+        self._try_modifying_global('short3_global', '{44, -55, 66}',
+                                  '(short3)', ['(short3)', '(44, -55, 66,'])
+
+    def test_write_vec3(self):
+        # 3-element vectors from ushort3 through double3; the last three
+        # calls also cover char4, uchar4 and short4.
+        self._try_modifying_global('ushort3_global', '{88, 99, 111}',
+                                  '(ushort3)', ['(ushort3)', '(88, 99, 111,'])
+
+        self._try_modifying_global('int3_global', '{-111, 222, -333}',
+                                  '(int3)', ['(int3)', '(-111, 222, -333,'])
+
+        self._try_modifying_global('uint3_global', '{444, 555, 666}',
+                                  '(uint3)', ['(uint3)', '(444, 555, 666,'])
+
+        self._try_modifying_global('float3_global', '{7.5F, 0008.000, 9}',
+                                  '(float3)', ['(float3)', '(7.5, 8, 9,'])
+
+        self._try_modifying_global('long3_global', '{111111, -22222222, 3333333}',
+                                  '(long3)', ['(long3)', '(111111, -22222222, 3333333,'])
+
+        self._try_modifying_global('ulong3_global', '{4444444, 5555555, 66666666}',
+                                  '(ulong3)', ['(ulong3)', '(4444444, 5555555, 66666666,'])
+
+        self._try_modifying_global('double3_global', '{7.5L, -0, 8.9e1}',
+                                  '(double3)', ['(double3)', '(7.5, 0, 89,'])
+
+        self._try_modifying_global('char4_global', '{0x1, 0x2, 0x3, 0x4}',
+                                  '(char4)',
+                                  ['(char4)', '(1, 2, 3, 4)'])
+
+        self._try_modifying_global('uchar4_global', '{0x5, 0x6, 0x7, 0x8}',
+                                  '(uchar4)',
+                                  ['(uchar4)', '(0x05, 0x06, 0x07, 0x08)'])
+
+        self._try_modifying_global('short4_global', '{0x9, 0xa, 0xb, 0xc}',
+                                  '(short4)',
+                                  ['(short4)', '(9, 10, 11, 12)'])
+
+    @wimpy
+    def test_write_ushort4(self):
+        # values are written in hex and expected back in decimal
+        self._try_modifying_global('ushort4_global', '{0xd, 0xe, 0xf, 0x10}',
+                                  '(ushort4)',
+                                  ['(ushort4)', '(13, 14, 15, 16)'])
+
+    def test_write_vec4_global(self):
+        # 4-element vectors from int4 through double4
+        self._try_modifying_global('int4_global', '{0x11, 0x12, 0x13, 0x14}',
+                                  '(int4)',
+                                  ['(int4)', '(17, 18, 19, 20)'])
+
+        self._try_modifying_global('uint4_global', '{0x15, 0x16, 0x17, 0x18}',
+                                  '(uint4)',
+                                  ['(uint4)', '(21, 22, 23, 24)'])
+
+        self._try_modifying_global('float4_global', '{19.0, 20.5, -21, -22.5}',
+                                  '(float4)',
+                                  ['(float4)', '(19, 20.5, -21, -22.5)'])
+
+        self._try_modifying_global('long4_global', '{0x1d, 0x1e, 0x1f, 0x20}',
+                                  '(long4)',
+                                  ['(long4)', '(29, 30, 31, 32)'])
+
+        self._try_modifying_global('ulong4_global', '{0x21, 0x22, 0x23, 0x24}',
+                                  '(ulong4)',
+                                  ['(ulong4)', '(33, 34, 35, 36)'])
+
+        self._try_modifying_global('double4_global', '{25.000, -26, -27.5, 28.0}',
+                                  '(double4)',
+                                  ['(double4)', '(25, -26, -27.5, 28)'])
+
+    @ordered_test('last')
+    @cpp_only_test()
+    def test_cpp_cleanup(self):
+        # Remove the breakpoint set in test_setup, then let the process run
+        # to its exit.
+        self.try_command('breakpoint delete 1', ['1 breakpoints deleted'])
+
+        self.try_command('process continue', ['exited with status = 0'])

diff --git a/tests/lldb/tests/testcases/test_write_global_element.py b/tests/lldb/tests/testcases/test_write_global_element.py
new file mode 100644
index 0000000..28882bd
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_write_global_element.py

@@ -0,0 +1,292 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestWriteGlobalElement.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    ordered_test,
+    wimpy,
+    cpp_only_test
+)
+
+
+class TestWriteGlobalElement(TestBaseRemote):
+    '''Tests modifying elements of global variables of all types.'''
+
+    # Bundle names keyed by 'java', 'jni' and 'cpp'.
+    # NOTE(review): presumably read by the harness to choose the app under
+    # test for each flavour -- confirm against TestBaseRemote.
+    bundle_target = {
+        'java': 'KernelVariables',
+        'jni': 'JNIKernelVariables',
+        'cpp': 'CppKernelVariables'
+    }
+
+    def _try_inspecting_global(self, global_name, expected_output):
+        '''Evaluate a global with "expr" and check what lldb prints.
+
+        Args:
+            global_name: Name of the global to evaluate, as a string.
+            expected_output: List of strings passed to try_command as the
+                             output that should be found.
+
+        Raises:
+            TestFail: The lldb command did not provide the expected output.
+        '''
+        command = 'expr ' + global_name
+        self.try_command(command, expected_output)
+
+    def _try_modifying_global(self, global_name, new_value, expected_output,
+                             expected_output_regex=None):
+        '''Assign to a global, then read it back, checking both outputs.
+
+        First runs "expr <global_name> = <new_value>", then
+        "target variable <global_name>". The same expected strings and
+        regular expressions are passed to try_command for both commands.
+
+        Args:
+            global_name: String which is the name of the global to modify.
+            new_value: String holding the value to assign to the global.
+            expected_output: List of strings that should be found in the
+                             output of both commands.
+            expected_output_regex: List of regular expressions that should be
+                                   found in the output of both commands, or
+                                   None (the default).
+
+        Raises:
+            TestFail: One of the lldb commands did not provide the expected
+                      output.
+        '''
+        checks = (expected_output, expected_output_regex)
+        self.try_command('expr %s = %s' % (global_name, new_value), *checks)
+        self.try_command('target variable ' + global_name, *checks)
+
+    @wimpy
+    @ordered_test(0)
+    def test_setup(self):
+        '''Check the RenderScript runtime status and stop at simple.rs:145.'''
+        runtime_found = ['Runtime Library discovered',
+                         'Runtime Driver discovered']
+        self.try_command('language renderscript status', runtime_found)
+
+        self.try_command('b -f simple.rs -l 145', [])
+
+        hit_breakpoint = ['resuming', 'stopped', 'stop reason = breakpoint']
+        self.try_command('process continue', hit_breakpoint)
+
+    @wimpy
+    def test_modify_global(self):
+        '''Set char2_global[0] to 2, then inspect the whole vector.'''
+        element_type = [r'\((signed )?char\)']
+        self._try_modifying_global('char2_global[0]', '2', ['\'\\x02\''],
+                                  element_type)
+
+        whole_vector = ['(char2)', '(2, -22)']
+        self._try_inspecting_global('char2_global', whole_vector)
+
+    def test_vec2(self):
+        '''Write one element of each listed vector global and inspect it.
+
+        Each case is (vector name, element index, new value, expected output
+        of the write, expected regex list or None, expected output when the
+        whole vector is inspected). None is the default that
+        _try_modifying_global uses for expected_output_regex. Cases run in
+        the order listed.
+        '''
+        cases = (
+            ('uchar2_global', 1, '3', ['\'\\x03\''],
+             [r'\(u(nsigned )?char\)'], ['(uchar2)', '(0x21, 0x03)']),
+            ('short2_global', 0, '-44', ['(short)', '-44'],
+             None, ['(short2)', '(-44, 666)']),
+            ('ushort2_global', 1, '55', ['55'],
+             [r'\(u(nsigned )?short\)'], ['(ushort2)', '(777, 55)']),
+            ('int2_global', 0, '666', ['(int)', '666'],
+             None, ['(int2)', '(666, -1111)']),
+            ('uint2_global', 1, '777', ['777'],
+             [r'\(u(nsigned )?int\)'], ['(uint2)', '(2222, 777)']),
+            ('float2_global', 0, '-8.5', ['(float)', '-8.5'],
+             None, ['(float2)', '(-8.5, -5)']),
+            ('long2_global', 1, '999999', ['999999'],
+             [r'\((long )?long\)'], ['(long2)', '(-4444, 999999)']),
+            ('ulong2_global', 0, '10101010101', ['10101010101'],
+             [r'\(u(nsigned )?(long )?long\)'],
+             ['(ulong2)', '(10101010101, 7777)']),
+            ('double2_global', 1, '-11.000', ['(double)', '-11'],
+             None, ['(double2)', '(88.5, -11)']),
+            ('char3_global', 0, '12', ['\'\\f\''],
+             [r'\((signed )?char\)'], ['(char3)', '(12, -22, -33,']),
+        )
+        for vector, index, value, expected, regex, whole in cases:
+            self._try_modifying_global('%s[%d]' % (vector, index), value,
+                                      expected, regex)
+            self._try_inspecting_global(vector, whole)
+
+    @wimpy
+    def test_uchar3(self):
+        '''Set uchar3_global[1] to the character d, then inspect the vector.'''
+        element_type = [r'\(u(nsigned )?char\)']
+        self._try_modifying_global('uchar3_global[1]', '\'d\'', ['\'d\''],
+                                  element_type)
+
+        whole_vector = ['(uchar3)', '(0x21, 0x64, 0x37,']
+        self._try_inspecting_global('uchar3_global', whole_vector)
+
+    def test_vec3(self):
+        '''Write one element of each listed vector global and inspect it.
+
+        Each case is (vector name, element index, new value, expected output
+        of the write, expected regex list or None, expected output when the
+        whole vector is inspected). None is the default that
+        _try_modifying_global uses for expected_output_regex. Cases run in
+        the order listed.
+        '''
+        cases = (
+            ('short3_global', 2, '-131', ['(short)', '-131'],
+             None, ['(short3)', '(-555, 666, -131,']),
+            ('ushort3_global', 0, '1414', ['1414'],
+             [r'\(u(nsigned )?short\)'], ['(ushort3)', '(1414, 888, 999,']),
+            ('int3_global', 0, '151515', ['(int)', '151515'],
+             None, ['(int3)', '(151515, -1111, 2222,']),
+            ('uint3_global', 1, '161616', ['161616'],
+             [r'\(u(nsigned )?int\)'], ['(uint3)', '(2222, 161616, 4444,']),
+            ('float3_global', 2, '17.5', ['(float)', '17.5'],
+             None, ['(float3)', '(4.5, -5, 17.5,']),
+            ('long3_global', 0, '-181818181818', ['-181818181818'],
+             [r'\((long )?long\)'],
+             ['(long3)', '(-181818181818, 5555, 6666,']),
+            ('ulong3_global', 1, '191919191919', ['191919191919'],
+             [r'\(u(nsigned )?(long )?long\)'],
+             ['(ulong3)', '(6666, 191919191919, 8888,']),
+            ('double3_global', 2, '20.5', ['(double)', '20.5'],
+             None, ['(double3)', '(88.5, -99, 20.5,']),
+            ('char4_global', 0, '-21', ['\'\\xeb\''],
+             [r'\((signed )?char\)'], ['(char4)', '(-21, 11, -22, -33)']),
+            ('uchar4_global', 1, '22', ['\'\\x16\''],
+             [r'\(u(nsigned )?char\)'],
+             ['(uchar4)', '(0xde, 0x16, 0x2c, 0x37)']),
+        )
+        for vector, index, value, expected, regex, whole in cases:
+            self._try_modifying_global('%s[%d]' % (vector, index), value,
+                                      expected, regex)
+            self._try_inspecting_global(vector, whole)
+
+    @wimpy
+    def test_short4(self):
+        '''Set short4_global[2] to 23, then inspect the whole vector.'''
+        self._try_modifying_global('short4_global[2]', '23', ['(short)', '23'])
+
+        whole_vector = ['(short4)', '(-444, -555, 23, 777)']
+        self._try_inspecting_global('short4_global', whole_vector)
+
+    def test_vec4(self):
+        '''Write one element of each listed 4-element global and inspect it.
+
+        Each case is (vector name, element index, new value, expected output
+        of the write, expected regex list or None, expected output when the
+        whole vector is inspected). None is the default that
+        _try_modifying_global uses for expected_output_regex. Cases run in
+        the order listed.
+        '''
+        cases = (
+            ('ushort4_global', 3, '24', ['24'],
+             [r'\(u(nsigned )?short\)'],
+             ['(ushort4)', '(666, 777, 888, 24)']),
+            ('int4_global', 0, '-2525', ['(int)', '-2525'],
+             None, ['(int4)', '(-2525, 999, -1111, 2222)']),
+            ('uint4_global', 1, '26262', ['26262'],
+             [r'\(u(nsigned )?int\)'],
+             ['(uint4)', '(1111, 26262, 3333, 4444)']),
+            ('float4_global', 2, '27.0f', ['(float)', '27'],
+             None, ['(float4)', '(3, 4.5, 27, -6.5)']),
+            ('long4_global', 3, '-28282828282', ['-28282828282'],
+             [r'\((long )?long\)'],
+             ['(long4)', '(-3333, -4444, 5555, -28282828282)']),
+            ('ulong4_global', 0, '2929292929', ['2929292929'],
+             [r'\(u(nsigned )?(long )?long\)'],
+             ['(ulong4)', '(2929292929, 6666, 7777, 8888)']),
+            ('double4_global', 1, '30.5', ['(double)', '30.5'],
+             None, ['(double4)', '(-77, 30.5, -99, 111.5)']),
+        )
+        for vector, index, value, expected, regex, whole in cases:
+            self._try_modifying_global('%s[%d]' % (vector, index), value,
+                                      expected, regex)
+            self._try_inspecting_global(vector, whole)
+
+    @cpp_only_test()
+    @ordered_test('last')
+    def test_cpp_cleanup(self):
+        '''Delete breakpoint 1, then continue and expect exit status 0.'''
+        steps = (
+            ('breakpoint delete 1', ['1 breakpoints deleted']),
+            ('process continue', ['exited with status = 0']),
+        )
+        for command, expected in steps:
+            self.try_command(command, expected)

diff --git a/tests/lldb/tests/testcases/test_write_local.py b/tests/lldb/tests/testcases/test_write_local.py
new file mode 100644
index 0000000..cd32ecc
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_write_local.py

@@ -0,0 +1,223 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestWriteLocal.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    ordered_test,
+    wimpy
+)
+
+
+class TestWriteLocal(TestBaseRemote):
+    '''Tests modifying local variables of all types.'''
+
+    # Bundle names keyed by 'java', 'jni' and 'cpp'.
+    # NOTE(review): presumably read by the harness to choose the app under
+    # test for each flavour -- confirm against TestBaseRemote.
+    bundle_target = {
+        'java': 'KernelVariables',
+        'jni': 'JNIKernelVariables',
+        'cpp': 'CppKernelVariables'
+    }
+
+    def _try_modifying_local(self, local_name, new_value, data_type_in,
+                             expected_output, expected_output_regex=None):
+        '''Assign to a local, then read it back, checking both outputs.
+
+        First runs "expr <local_name> = <data_type_in><new_value>", then
+        "frame variable <local_name>". The same expected strings and regular
+        expressions are passed to try_command for both commands.
+
+        Args:
+            local_name: String which is the name of the local to modify.
+            new_value: String holding the value to assign to the local.
+            data_type_in: String containing a c-style parenthesised data
+                          type; it is placed directly before new_value in
+                          the expression.
+            expected_output: List of strings that should be found in the
+                             output of both commands.
+            expected_output_regex: List of regular expressions that should be
+                                   found in the output of both commands, or
+                                   None (the default).
+
+        Raises:
+            TestFail: One of the lldb commands did not provide the expected
+                      output.
+        '''
+        # pylint: disable=too-many-arguments
+        checks = (expected_output, expected_output_regex)
+        assignment = 'expr %s = %s%s' % (local_name, data_type_in, new_value)
+        self.try_command(assignment, *checks)
+        self.try_command('frame variable ' + local_name, *checks)
+
+    @wimpy
+    @ordered_test(0)
+    def test_setup(self):
+        '''Check the RenderScript runtime status and stop at simple.rs:145.'''
+        runtime_found = ['Runtime Library discovered',
+                         'Runtime Driver discovered']
+        self.try_command('language renderscript status', runtime_found)
+
+        self.try_command('b -f simple.rs -l 145', [])
+
+        hit_breakpoint = ['resuming', 'stopped', 'stop reason = breakpoint']
+        self.try_command('process continue', hit_breakpoint)
+
+    @wimpy
+    def test_modify_char(self):
+        '''Assign -2 to char_local through a (signed char) cast.'''
+        expected = ['\'\\xfe\'']
+        type_pattern = [r'\((signed )?char\)']
+        self._try_modifying_local('char_local', '-2', '(signed char)',
+                                 expected, type_pattern)
+
+    def test_modify_primitive(self):
+        '''Overwrite each listed scalar local, and finally char2_local.
+
+        Each case holds the positional arguments for _try_modifying_local:
+        local name, new value, cast, expected output and, for long_local
+        only, a list of regular expressions. Cases run in the order listed.
+        '''
+        cases = (
+            ('uchar_local', '22', '(uchar)', ['(uchar)', '\'\\x16\'']),
+            ('short_local', '-33', '(short)', ['(short)', '-33']),
+            ('ushort_local', '44', '(ushort)', ['(ushort)', '44']),
+            ('int_local', '-55', '(int)', ['(int)', '-55']),
+            ('uint_local', '66', '(uint)', ['(uint)', '66']),
+            ('float_local', '-7.5', '(float)', ['(float)', '-7.5']),
+            ('long_local', '-888888', '(long long)', ['-888888'],
+             [r'\((long )?long\)']),
+            ('ulong_local', '99999999', '(ulong)', ['(ulong)', '99999999']),
+            ('double_local', '-10101.5', '(double)',
+             ['(double)', '-10101.5']),
+            ('char2_local', '{22, 4}', '(char2)', ['(char2)', '(22, 4)']),
+        )
+        for case in cases:
+            self._try_modifying_local(*case)
+
+    @wimpy
+    def test_modify_uchar2(self):
+        '''Assign {44, 55} to uchar2_local and expect it reported in hex.'''
+        expected = ['(uchar2)', '(0x2c, 0x37)']
+        self._try_modifying_local('uchar2_local', '{44, 55}', '(uchar2)',
+                                 expected)
+
+    def test_modify_vec2(self):
+        '''Overwrite each listed 2-element local, then char3 and uchar3.
+
+        Each case holds the positional arguments for _try_modifying_local:
+        local name, new value, cast and expected output. Cases run in the
+        order listed.
+        '''
+        cases = (
+            ('short2_local', '{-66, 77}', '(short2)',
+             ['(short2)', '(-66, 77)']),
+            ('ushort2_local', '{88, 99}', '(ushort2)',
+             ['(ushort2)', '(88, 99)']),
+            ('int2_local', '{111, -222}', '(int2)',
+             ['(int2)', '(111, -222)']),
+            ('uint2_local', '{333, 444}', '(uint2)',
+             ['(uint2)', '(333, 444)']),
+            ('float2_local', '{-55.5f, 6.0}', '(float2)',
+             ['(float2)', '(-55.5, 6)']),
+            ('long2_local', '{666666, -777777}', '(long2)',
+             ['(long2)', '(666666, -777777)']),
+            ('ulong2_local', '{888888, 999999}', '(ulong2)',
+             ['(ulong2)', '(888888, 999999)']),
+            ('double2_local', '{11.0000000, -0.0l}', '(double2)',
+             ['(double2)', '(11, -0)']),
+            ('char3_local', '{2, -3, 4}', '(char3)',
+             ['(char3)', '(2, -3, 4,']),
+            ('uchar3_local', '{\'a\', \'b\', \'c\'}', '(uchar3)',
+             ['(uchar3)', '(0x61, 0x62, 0x63,']),
+        )
+        for case in cases:
+            self._try_modifying_local(*case)
+
+    @wimpy
+    def test_modify_short3(self):
+        '''Assign {44, -55, 66} to short3_local and check the new value.'''
+        expected = ['(short3)', '(44, -55, 66,']
+        self._try_modifying_local('short3_local', '{44, -55, 66}', '(short3)',
+                                 expected)
+
+    def test_modify_vec3(self):
+        '''Overwrite each listed 3-element local, then char4/uchar4/short4.
+
+        Each case holds the positional arguments for _try_modifying_local:
+        local name, new value, cast and expected output. Cases run in the
+        order listed.
+        '''
+        cases = (
+            ('ushort3_local', '{88, 99, 111}', '(ushort3)',
+             ['(ushort3)', '(88, 99, 111,']),
+            ('int3_local', '{-111, 222, -333}', '(int3)',
+             ['(int3)', '(-111, 222, -333,']),
+            ('uint3_local', '{444, 555, 666}', '(uint3)',
+             ['(uint3)', '(444, 555, 666,']),
+            ('float3_local', '{7.5F, 0008.000, 9}', '(float3)',
+             ['(float3)', '(7.5, 8, 9,']),
+            ('long3_local', '{111111, -22222222, 3333333}', '(long3)',
+             ['(long3)', '(111111, -22222222, 3333333,']),
+            ('ulong3_local', '{4444444, 5555555, 66666666}', '(ulong3)',
+             ['(ulong3)', '(4444444, 5555555, 66666666,']),
+            ('double3_local', '{7.5L, -0, 8.9e1}', '(double3)',
+             ['(double3)', '(7.5, 0, 89,']),
+            ('char4_local', '{0x1, 0x2, 0x3, 0x4}', '(char4)',
+             ['(char4)', '(1, 2, 3, 4)']),
+            ('uchar4_local', '{0x5, 0x6, 0x7, 0x8}', '(uchar4)',
+             ['(uchar4)', '(0x05, 0x06, 0x07, 0x08)']),
+            ('short4_local', '{0x9, 0xa, 0xb, 0xc}', '(short4)',
+             ['(short4)', '(9, 10, 11, 12)']),
+        )
+        for case in cases:
+            self._try_modifying_local(*case)
+
+    @wimpy
+    def test_modify_ushort4(self):
+        '''Assign {0xd, 0xe, 0xf, 0x10} to ushort4_local and check it.'''
+        expected = ['(ushort4)', '(13, 14, 15, 16)']
+        self._try_modifying_local('ushort4_local', '{0xd, 0xe, 0xf, 0x10}',
+                                 '(ushort4)', expected)
+
+    def test_modify_vec4(self):
+        '''Overwrite the int4, uint4, float4, long4, ulong4 and double4 locals.
+
+        Each case holds the positional arguments for _try_modifying_local:
+        local name, new value, cast and expected output. Cases run in the
+        order listed.
+        '''
+        cases = (
+            ('int4_local', '{0x11, 0x12, 0x13, 0x14}', '(int4)',
+             ['(int4)', '(17, 18, 19, 20)']),
+            ('uint4_local', '{0x15, 0x16, 0x17, 0x18}', '(uint4)',
+             ['(uint4)', '(21, 22, 23, 24)']),
+            ('float4_local', '{19.0, 20.5, -21, -22.5}', '(float4)',
+             ['(float4)', '(19, 20.5, -21, -22.5)']),
+            ('long4_local', '{0x1d, 0x1e, 0x1f, 0x20}', '(long4)',
+             ['(long4)', '(29, 30, 31, 32)']),
+            ('ulong4_local', '{0x21, 0x22, 0x23, 0x24}', '(ulong4)',
+             ['(ulong4)', '(33, 34, 35, 36)']),
+            ('double4_local', '{25.000, -26, -27.5, 28.0}', '(double4)',
+             ['(double4)', '(25, -26, -27.5, 28)']),
+        )
+        for case in cases:
+            self._try_modifying_local(*case)

diff --git a/tests/lldb/tests/testcases/test_write_local_element.py b/tests/lldb/tests/testcases/test_write_local_element.py
new file mode 100644
index 0000000..17e6121
--- /dev/null
+++ b/tests/lldb/tests/testcases/test_write_local_element.py

@@ -0,0 +1,295 @@
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''Module that contains the test TestWriteLocalElement.'''
+
+from __future__ import absolute_import
+
+from harness.test_base_remote import TestBaseRemote
+from harness.decorators import (
+    wimpy,
+    ordered_test
+)
+
+
+class TestWriteLocalElement(TestBaseRemote):
+    '''Tests modifying elements of local variables of all types.'''
+
+    # Bundle names keyed by 'java', 'jni' and 'cpp'.
+    # NOTE(review): presumably read by the harness to choose the app under
+    # test for each flavour -- confirm against TestBaseRemote.
+    bundle_target = {
+        'java': 'KernelVariables',
+        'jni': 'JNIKernelVariables',
+        'cpp': 'CppKernelVariables'
+    }
+
+    def _try_inspecting_local(self, local_name, expected_output):
+        '''Evaluate a local with "expr" and check what lldb prints.
+
+        Args:
+            local_name: Name of the local to evaluate, as a string.
+            expected_output: List of strings passed to try_command as the
+                             output that should be found.
+
+        Raises:
+            TestFail: The lldb command did not provide the expected output.
+        '''
+        command = 'expr ' + local_name
+        self.try_command(command, expected_output)
+
+    def _try_modifying_local(self, local_name, new_value, expected_output,
+                             expected_output_regex=None):
+        '''Assign to a local, then read it back, checking both outputs.
+
+        First runs "expr <local_name> = <new_value>", then
+        "frame variable <local_name>". The same expected strings and regular
+        expressions are passed to try_command for both commands.
+
+        Args:
+            local_name: String which is the name of the local to modify.
+            new_value: String holding the value to assign to the local.
+            expected_output: List of strings that should be found in the
+                             output of both commands.
+            expected_output_regex: List of regular expressions that should be
+                                   found in the output of both commands, or
+                                   None (the default).
+
+        Raises:
+            TestFail: One of the lldb commands did not provide the expected
+                      output.
+        '''
+        checks = (expected_output, expected_output_regex)
+        self.try_command('expr %s = %s' % (local_name, new_value), *checks)
+        self.try_command('frame variable ' + local_name, *checks)
+
+    @wimpy
+    @ordered_test(0)
+    def test_setup(self):
+        '''Check the RenderScript runtime status and stop at simple.rs:145.'''
+        runtime_found = ['Runtime Library discovered',
+                         'Runtime Driver discovered']
+        self.try_command('language renderscript status', runtime_found)
+
+        self.try_command('b -f simple.rs -l 145', [])
+
+        hit_breakpoint = ['resuming', 'stopped', 'stop reason = breakpoint']
+        self.try_command('process continue', hit_breakpoint)
+
+    @wimpy
+    def test_modify_char2(self):
+        '''Set char2_local[0] to 2, then inspect the whole vector.'''
+        element_type = [r'\((signed )?char\)']
+        self._try_modifying_local('char2_local[0]', '2', ['2'], element_type)
+
+        whole_vector = ['(char2)', '(2, -22)']
+        self._try_inspecting_local('char2_local', whole_vector)
+
+    def test_modify_vec2(self):
+        '''Write one element of each 2-element local, then char3_local[0].
+
+        Each table case is (vector name, element index, new value, expected
+        output of the write, expected regex list or None, expected output
+        when the whole vector is inspected). None is the default that
+        _try_modifying_local uses for expected_output_regex. Cases run in
+        the order listed; char3_local is handled separately afterwards.
+        '''
+        cases = (
+            ('uchar2_local', 1, '3', ['3'],
+             [r'\(u(nsigned )?char\)'], ['(uchar2)', '(0x21, 0x03)']),
+            ('short2_local', 0, '-44', ['(short)', '-44'],
+             None, ['(short2)', '(-44, 666)']),
+            ('ushort2_local', 1, '55', ['55'],
+             [r'\(u(nsigned )?short\)'], ['(ushort2)', '(777, 55)']),
+            ('int2_local', 0, '666', ['(int)', '666'],
+             None, ['(int2)', '(666, -1111)']),
+            ('uint2_local', 1, '777', ['777'],
+             [r'\(u(nsigned )?int\)'], ['(uint2)', '(2222, 777)']),
+            ('float2_local', 0, '-8.5', ['(float)', '-8.5'],
+             None, ['(float2)', '(-8.5, -5)']),
+            ('long2_local', 1, '999999', ['999999'],
+             [r'\((long )?long\)'], ['(long2)', '(-4444, 999999)']),
+            ('ulong2_local', 0, '10101010101', ['10101010101'],
+             [r'\(u(nsigned )?(long )?long\)'],
+             ['(ulong2)', '(10101010101, 7777)']),
+            ('double2_local', 1, '-11.000', ['(double)', '-11'],
+             None, ['(double2)', '(88.5, -11)']),
+        )
+        for vector, index, value, expected, regex, whole in cases:
+            self._try_modifying_local('%s[%d]' % (vector, index), value,
+                                     expected, regex)
+            self._try_inspecting_local(vector, whole)
+
+        # For some char and uchar elements "expr" prints the result as an
+        # escaped character while "frame variable" prints it in decimal.
+        # _try_modifying_local reuses one expected output for both commands,
+        # so the two commands are issued directly here instead.
+        self.try_command('expr char3_local[0] = 12',
+                         ['\'\\f\''], [r'\((signed )?char\)'])
+        self.try_command('frame variable char3_local[0]',
+                         ['12'], [r'\((signed )?char\)'])
+
+        self._try_inspecting_local('char3_local',
+                                  ['(char3)', '(12, -22, -33,'])
+
+    @wimpy
+    def test_modify_uchar3(self):
+        '''Set uchar3_local[1] to the character d and verify the result.
+
+        "expr" and "frame variable" print the element differently (as a
+        character and as hex respectively), so each command gets its own
+        expected output instead of going through _try_modifying_local.
+
+        The whole vector is inspected here as well, so this test verifies
+        its own write and the check also runs when only wimpy tests are
+        selected, rather than relying solely on a later test.
+        '''
+        self.try_command('expr uchar3_local[1] = \'d\'',
+                         ['\'d\''],
+                         [r'\(u(nsigned )?char\)'])
+        self.try_command('frame variable uchar3_local[1]',
+                         ['0x64'],
+                         [r'\(u(nsigned )?char\)'])
+
+        # Read-only check of the vector after the element write above.
+        self._try_inspecting_local('uchar3_local',
+                                  ['(uchar3)',
+                                   '(0x21, 0x64, 0x37,'])
+
+
+    def test_modify_vec3(self):
+        '''Write one element of each 3-element local, then char4 and uchar4.
+
+        Starts by inspecting uchar3_local and expecting element 1 to read
+        0x64, the value written by test_modify_uchar3.
+
+        Each table case is (vector name, element index, new value, expected
+        output of the write, expected regex list or None, expected output
+        when the whole vector is inspected). None is the default that
+        _try_modifying_local uses for expected_output_regex. Cases run in
+        the order listed; char4_local and uchar4_local follow separately
+        because "expr" and "frame variable" need different expected output.
+        '''
+        self._try_inspecting_local('uchar3_local',
+                                  ['(uchar3)', '(0x21, 0x64, 0x37,'])
+
+        cases = (
+            ('short3_local', 2, '-131', ['(short)', '-131'],
+             None, ['(short3)', '(-555, 666, -131,']),
+            ('ushort3_local', 0, '1414', ['1414'],
+             [r'\(u(nsigned )?short\)'], ['(ushort3)', '(1414, 888, 999,']),
+            ('int3_local', 0, '151515', ['(int)', '151515'],
+             None, ['(int3)', '(151515, -1111, 2222,']),
+            ('uint3_local', 1, '161616', ['161616'],
+             [r'\(u(nsigned )?int\)'], ['(uint3)', '(2222, 161616, 4444,']),
+            ('float3_local', 2, '17.5', ['(float)', '17.5'],
+             None, ['(float3)', '(4.5, -5, 17.5,']),
+            ('long3_local', 0, '-181818181818', ['-181818181818'],
+             [r'\((long )?long\)'],
+             ['(long3)', '(-181818181818, 5555, 6666,']),
+            ('ulong3_local', 1, '191919191919', ['191919191919'],
+             [r'\(u(nsigned )?(long )?long\)'],
+             ['(ulong3)', '(6666, 191919191919, 8888,']),
+            ('double3_local', 2, '20.5', ['(double)', '20.5'],
+             None, ['(double3)', '(88.5, -99, 20.5,']),
+        )
+        for vector, index, value, expected, regex, whole in cases:
+            self._try_modifying_local('%s[%d]' % (vector, index), value,
+                                     expected, regex)
+            self._try_inspecting_local(vector, whole)
+
+        # char4_local: written with "expr", read back with "frame variable".
+        self.try_command('expr char4_local[0] = -21',
+                         ['\'\\xeb\''], [r'\((signed )?char\)'])
+        self.try_command('frame variable char4_local[0]',
+                         ['-21'], [r'\((signed )?char\)'])
+        self._try_inspecting_local('char4_local',
+                                  ['(char4)', '(-21, 11, -22, -33)'])
+
+        # uchar4_local: same pattern, with the read-back printed in hex.
+        self.try_command('expr uchar4_local[1] = 22',
+                         ['\'\\x16\''], [r'\(u(nsigned )?char\)'])
+        self.try_command('frame variable uchar4_local[1]',
+                         ['0x16'], [r'\(u(nsigned )?char\)'])
+        self._try_inspecting_local('uchar4_local',
+                                  ['(uchar4)', '(0x16, 0x16, 0x2c, 0x37)'])
+
+    @wimpy
+    def test_modify_short4(self):
+        self._try_modifying_local('short4_local[2]', '23',
+                                 ['(short)', '23'])
+        self._try_inspecting_local('short4_local',
+                                  ['(short4)',
+                                   '(-444, -555, 23, 777)'])
+
+    def test_modify_vec4(self):
+        self._try_modifying_local('ushort4_local[3]', '24',
+                                 ['24'], [r'\(u(nsigned )?short\)'])
+        self._try_inspecting_local('ushort4_local',
+                                  ['(ushort4)',
+                                   '(666, 777, 888, 24)'])
+
+        self._try_modifying_local('int4_local[0]', '-2525',
+                                 ['(int)', '-2525'])
+        self._try_inspecting_local('int4_local',
+                                  ['(int4)',
+                                   '(-2525, 999, -1111, 2222)'])
+
+        self._try_modifying_local('uint4_local[1]', '26262',
+                                 ['26262'], [r'\(u(nsigned )?int\)'])
+        self._try_inspecting_local('uint4_local',
+                                  ['(uint4)',
+                                   '(1111, 26262, 3333, 4444)'])
+
+        self._try_modifying_local('float4_local[2]', '27.0f',
+                                 ['(float)', '27'])
+        self._try_inspecting_local('float4_local',
+                                  ['(float4)',
+                                   '(3, 4.5, 27, -6.5)'])
+
+        self._try_modifying_local('long4_local[3]', '-28282828282',
+                                 ['-28282828282'], [r'\((long )?long\)'])
+        self._try_inspecting_local('long4_local',
+                                  ['(long4)',
+                                   '(-3333, -4444, 5555, -28282828282)'])
+
+        self._try_modifying_local('ulong4_local[0]', '2929292929',
+                                 ['2929292929'],
+                                 [r'\(u(nsigned )?(long )?long\)'])
+        self._try_inspecting_local('ulong4_local',
+                                  ['(ulong4)',
+                                   '(2929292929, 6666, 7777, 8888)'])
+
+        self._try_modifying_local('double4_local[1]', '30.5',
+                                 ['(double)', '30.5'])
+        self._try_inspecting_local('double4_local',
+                                  ['(double4)',
+                                   '(-77, 30.5, -99, 111.5)'])

diff --git a/toolkit/Android.bp b/toolkit/Android.bp
deleted file mode 100644
index d3fa21e..0000000
--- a/toolkit/Android.bp
+++ /dev/null

@@ -1,135 +0,0 @@
-// Package-level defaults: modules in this file use the Android-Apache-2.0 license unless they
-// declare their own.
-package {
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-// TODO: In later CLs, this build file will be replaced by a standalone build that's not part of Android.
-
-// Executable built from TestTaskProcessor.cpp and linked against the toolkit shared library.
-cc_binary {
-    name: "renderscripttoolkittest",
-    srcs: ["TestTaskProcessor.cpp"],
-    shared_libs: [
-        "libbase",
-        "librenderscripttoolkit",
-    ],
-}
-
-// Shared library holding the RenderScript toolkit operations.
-cc_library_shared {
-    name: "librenderscripttoolkit",
-    defaults: [],
-    vendor_available: false,
-    native_bridge_supported: false,
-    vndk: {
-        enabled: false,
-        support_system_process: false,
-    },
-
-    // Sources compiled for every architecture.
-    srcs: [
-        "Blend.cpp",
-        "Blur.cpp",
-        "ColorMatrix.cpp",
-        "Convolve3x3.cpp",
-        "Convolve5x5.cpp",
-        "Histogram.cpp",
-        "Lut.cpp",
-        "Lut3d.cpp",
-        "RenderScriptToolkit.cpp",
-        "Resize.cpp",
-        "TaskProcessor.cpp",
-        "Utils.cpp",
-        "YuvToRgb.cpp",
-    ],
-
-    static_libs: ["cpufeatures"],
-
-    // Per-architecture defines and the matching hand-written SIMD sources.
-    arch: {
-        arm64: {
-            cflags: [
-                "-DARCH_ARM_USE_INTRINSICS",
-                "-DARCH_ARM64_USE_INTRINSICS",
-                "-DARCH_ARM64_HAVE_NEON",
-            ],
-
-            srcs: [
-                "Blend_advsimd.S",
-                "Blur_advsimd.S",
-                "ColorMatrix_advsimd.S",
-                "Convolve_advsimd.S",
-                "Lut3d_advsimd.S",
-                "Resize_advsimd.S",
-                "YuvToRgb_advsimd.S",
-            ],
-        },
-
-        arm: {
-            cflags: [
-                "-DARCH_ARM_HAVE_VFP",
-                "-DARCH_ARM_USE_INTRINSICS",
-            ],
-
-            srcs: [
-                "Blend_neon.S",
-                "Blur_neon.S",
-                "ColorMatrix_neon.S",
-                "Convolve_neon.S",
-                "Lut3d_neon.S",
-                "Resize_neon.S",
-                "YuvToRgb_neon.S",
-            ],
-
-            asflags: ["-mfpu=neon"],
-
-            neon: {
-                cflags: [
-                    "-DARCH_ARM_HAVE_NEON",
-                ],
-            },
-        },
-
-        x86: {
-            cflags: ["-DARCH_X86_HAVE_SSSE3"],
-            srcs: ["x86.cpp"],
-        },
-        x86_64: {
-            cflags: ["-DARCH_X86_HAVE_SSSE3"],
-            srcs: ["x86.cpp"],
-            // Extra flags nested under x86_64 for its avx2 variant.
-            avx2: {
-                cflags: ["-DARCH_X86_HAVE_AVX2", "-mavx2", "-mfma"],
-            },
-        },
-    },
-
-    shared_libs: [
-        "libbase",
-        "liblog",
-        "libnativehelper",
-        "libjnigraphics",
-    ],
-    header_libs: [
-        // TODO: Once the .cpp files are compiled in, check whether any of these libraries are
-        // needed.
-        //"libutils_headers",
-        //"libhardware_headers",
-    ],
-
-    include_dirs: [
-    ],
-
-    // Warnings are errors; unused parameters and variables are tolerated.
-    cflags: [
-        "-Wthread-safety",
-        "-Werror",
-        "-Wall",
-        "-Wextra",
-        "-Wno-unused-parameter",
-        "-Wno-unused-variable",
-    ],
-
-    // TODO: Is this needed?
-    product_variables: {
-        pdk: {
-            // RenderScript modules are not built in PDK builds, because libmediandk is not
-            // available there.
-            enabled: false,
-        },
-    },
-}

diff --git a/toolkit/Blend.cpp b/toolkit/Blend.cpp
deleted file mode 100644
index 1f6319e..0000000
--- a/toolkit/Blend.cpp
+++ /dev/null

@@ -1,370 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <assert.h>
-
-#include <cstdint>
-
-#include "RenderScriptToolkit.h"
-#include "TaskProcessor.h"
-#include "Utils.h"
-
-namespace android {
-namespace renderscript {
-
-#define LOG_TAG "renderscript.toolkit.Blend"
-
-/**
- * Task that combines a source image with a destination image in place, using the blending
- * mode chosen at construction time.
- */
-class BlendTask : public Task {
-   private:
-    // Blending mode applied to every pixel.
-    RenderScriptToolkit::BlendingMode mMode;
-    // Source pixels; read only.
-    const uchar4* mIn;
-    // Destination pixels; read as the second operand and overwritten with the result.
-    uchar4* mOut;
-
-    // Blends `length` consecutive pixels of `in` into `out` according to `mode`.
-    void blend(RenderScriptToolkit::BlendingMode mode, const uchar4* in, uchar4* out,
-               uint32_t length);
-    // Handles one 2D tile of the overall work; threadIndex names the thread doing the work.
-    void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
-                     size_t endY) override;
-
-   public:
-    // The byte buffers are reinterpreted as uchar4 pixels; the base Task receives the image
-    // size, the literal arguments 4 and true, and the optional restriction.
-    BlendTask(RenderScriptToolkit::BlendingMode mode, const uint8_t* in, uint8_t* out, size_t sizeX,
-              size_t sizeY, const Restriction* restriction)
-        : Task{sizeX, sizeY, 4, true, restriction},
-          mMode{mode},
-          mIn{reinterpret_cast<const uchar4*>(in)},
-          mOut{reinterpret_cast<uchar4*>(out)} {}
-};
-
-#if defined(ARCH_ARM_USE_INTRINSICS)
-// Externally defined blend kernel with C linkage. BlendTask::blend() passes the blending mode
-// cast to int as `slot`, and treats a negative return value as "not handled" (it then falls
-// back to the portable code).
-extern "C" int rsdIntrinsicBlend_K(uchar4 *out, uchar4 const *in, int slot,
-                    uint32_t xstart, uint32_t xend);
-#endif
-
-#if defined(ARCH_X86_HAVE_SSSE3)
-// Externally defined kernels, one per blending mode. BlendTask::blend() passes the number of
-// 8-pixel groups as `count8` and advances its pointers by count8 * 8 pixels afterwards.
-extern void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8);
-extern void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8);
-extern void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8);
-extern void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8);
-extern void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8);
-extern void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8);
-extern void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8);
-extern void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8);
-extern void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8);
-extern void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8);
-extern void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8);
-extern void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8);
-#endif
-
-// Narrows one vector component to uchar; values above 255 become 255.
-template <typename T>
-static inline uchar clipToUchar(T value) {
-    return static_cast<uchar>(value > 255 ? 255 : value);
-}
-
-// Builds a uchar4 from a four-component vector, clipping each of x, y, z and w to 255.
-template <typename TI>
-static inline uchar4 convertClipped(TI amount) {
-    return uchar4{clipToUchar(amount.x), clipToUchar(amount.y), clipToUchar(amount.z),
-                  clipToUchar(amount.w)};
-}
-
-/**
- * Blends `length` consecutive pixels from `in` into `out`.
- *
- * When ARCH_ARM_USE_INTRINSICS is defined and mUsesSimd is set, the whole run is first offered
- * to rsdIntrinsicBlend_K(); a negative return value falls through to the portable code below.
- * When ARCH_X86_HAVE_SSSE3 is defined and mUsesSimd is set, most modes hand as many complete
- * groups of 8 pixels as possible to the matching kernel (only when more than 8 pixels remain)
- * and finish the rest with the scalar loop.
- *
- * @param mode   Blending mode to apply.
- * @param in     First source pixel.
- * @param out    First destination pixel; read and overwritten in place.
- * @param length Number of pixels to process.
- */
-void BlendTask::blend(RenderScriptToolkit::BlendingMode mode, const uchar4* in, uchar4* out,
-                      uint32_t length) {
-    uint32_t x1 = 0;
-    uint32_t x2 = length;
-
-#if defined(ARCH_ARM_USE_INTRINSICS)
-    if (mUsesSimd) {
-        if (rsdIntrinsicBlend_K(out, in, (int) mode, x1, x2) >= 0) {
-            return;
-        } else {
-            // The mode is cast explicitly: an enum passed through varargs is not an int.
-            ALOGW("Intrinsic Blend failed to use SIMD for %d", static_cast<int>(mode));
-        }
-    }
-#endif
-    switch (mode) {
-    // out = 0
-    case RenderScriptToolkit::BlendingMode::CLEAR:
-        for (;x1 < x2; x1++, out++) {
-            *out = 0;
-        }
-        break;
-    // out = in
-    case RenderScriptToolkit::BlendingMode::SRC:
-        for (;x1 < x2; x1++, out++, in++) {
-          *out = *in;
-        }
-        break;
-    //RenderScriptToolkit::BlendingMode::DST is a NOP
-    case RenderScriptToolkit::BlendingMode::DST:
-        break;
-    // out = clip(in + ((out * (255 - in.w)) >> 8))
-    case RenderScriptToolkit::BlendingMode::SRC_OVER:
-    #if defined(ARCH_X86_HAVE_SSSE3)
-        if (mUsesSimd) {
-            if ((x1 + 8) < x2) {
-                uint32_t len = (x2 - x1) >> 3;
-                rsdIntrinsicBlendSrcOver_K(out, in, len);
-                x1 += len << 3;
-                out += len << 3;
-                in += len << 3;
-            }
-        }
-    #endif
-        for (;x1 < x2; x1++, out++, in++) {
-            ushort4 in_s = convert<ushort4>(*in);
-            ushort4 out_s = convert<ushort4>(*out);
-            in_s = in_s + ((out_s * (ushort4)(255 - in_s.w)) >> (ushort4)8);
-            *out = convertClipped(in_s);
-        }
-        break;
-    // out = clip(out + ((in * (255 - out.w)) >> 8))
-    case RenderScriptToolkit::BlendingMode::DST_OVER:
-    #if defined(ARCH_X86_HAVE_SSSE3)
-        if (mUsesSimd) {
-            if ((x1 + 8) < x2) {
-                uint32_t len = (x2 - x1) >> 3;
-                rsdIntrinsicBlendDstOver_K(out, in, len);
-                x1 += len << 3;
-                out += len << 3;
-                in += len << 3;
-            }
-        }
-    #endif
-        for (;x1 < x2; x1++, out++, in++) {
-            ushort4 in_s = convert<ushort4>(*in);
-            ushort4 out_s = convert<ushort4>(*out);
-            in_s = out_s + ((in_s * (ushort4)(255 - out_s.w)) >> (ushort4)8);
-            *out = convertClipped(in_s);
-        }
-        break;
-    // out = (in * out.w) >> 8
-    case RenderScriptToolkit::BlendingMode::SRC_IN:
-    #if defined(ARCH_X86_HAVE_SSSE3)
-        if (mUsesSimd) {
-            if ((x1 + 8) < x2) {
-                uint32_t len = (x2 - x1) >> 3;
-                rsdIntrinsicBlendSrcIn_K(out, in, len);
-                x1 += len << 3;
-                out += len << 3;
-                in += len << 3;
-            }
-        }
-    #endif
-        for (;x1 < x2; x1++, out++, in++) {
-            ushort4 in_s = convert<ushort4>(*in);
-            in_s = (in_s * out->w) >> (ushort4)8;
-            *out = convert<uchar4>(in_s);
-        }
-        break;
-    // out = (out * in.w) >> 8
-    case RenderScriptToolkit::BlendingMode::DST_IN:
-    #if defined(ARCH_X86_HAVE_SSSE3)
-        if (mUsesSimd) {
-            if ((x1 + 8) < x2) {
-                uint32_t len = (x2 - x1) >> 3;
-                rsdIntrinsicBlendDstIn_K(out, in, len);
-                x1 += len << 3;
-                out += len << 3;
-                in += len << 3;
-            }
-        }
-    #endif
-        for (;x1 < x2; x1++, out++, in++) {
-            ushort4 out_s = convert<ushort4>(*out);
-            out_s = (out_s * in->w) >> (ushort4)8;
-            *out = convert<uchar4>(out_s);
-        }
-        break;
-    // out = (in * (255 - out.w)) >> 8
-    case RenderScriptToolkit::BlendingMode::SRC_OUT:
-    #if defined(ARCH_X86_HAVE_SSSE3)
-        if (mUsesSimd) {
-            if ((x1 + 8) < x2) {
-                uint32_t len = (x2 - x1) >> 3;
-                rsdIntrinsicBlendSrcOut_K(out, in, len);
-                x1 += len << 3;
-                out += len << 3;
-                in += len << 3;
-            }
-        }
-    #endif
-        for (;x1 < x2; x1++, out++, in++) {
-            ushort4 in_s = convert<ushort4>(*in);
-            in_s = (in_s * (ushort4)(255 - out->w)) >> (ushort4)8;
-            *out = convert<uchar4>(in_s);
-        }
-        break;
-    // out = (out * (255 - in.w)) >> 8
-    case RenderScriptToolkit::BlendingMode::DST_OUT:
-    #if defined(ARCH_X86_HAVE_SSSE3)
-        if (mUsesSimd) {
-            if ((x1 + 8) < x2) {
-                uint32_t len = (x2 - x1) >> 3;
-                rsdIntrinsicBlendDstOut_K(out, in, len);
-                x1 += len << 3;
-                out += len << 3;
-                in += len << 3;
-            }
-        }
-    #endif
-        for (;x1 < x2; x1++, out++, in++) {
-            ushort4 out_s = convert<ushort4>(*out);
-            out_s = (out_s * (ushort4)(255 - in->w)) >> (ushort4)8;
-            *out = convert<uchar4>(out_s);
-        }
-        break;
-    // out.xyz = clip((in.xyz * out.w + out.xyz * (255 - in.w)) >> 8); out.w is kept.
-    case RenderScriptToolkit::BlendingMode::SRC_ATOP:
-    #if defined(ARCH_X86_HAVE_SSSE3)
-        if (mUsesSimd) {
-            if ((x1 + 8) < x2) {
-                uint32_t len = (x2 - x1) >> 3;
-                rsdIntrinsicBlendSrcAtop_K(out, in, len);
-                x1 += len << 3;
-                out += len << 3;
-                in += len << 3;
-            }
-        }
-    #endif
-        for (;x1 < x2; x1++, out++, in++) {
-            // The max value the operation could produce before the shift
-            // is 255 * 255 + 255 * (255 - 0) = 130050, or 0x1FC02.
-            // That value does not fit in a ushort, so we use uint.
-            uint4 in_s = convert<uint4>(*in);
-            uint4 out_s = convert<uint4>(*out);
-            out_s.xyz = ((in_s.xyz * out_s.w) +
-              (out_s.xyz * ((uint3)255 - (uint3)in_s.w))) >> (uint3)8;
-            *out = convertClipped(out_s);
-        }
-        break;
-    // out.xyz = clip((out.xyz * in.w + in.xyz * (255 - out.w)) >> 8); out.w = in.w.
-    case RenderScriptToolkit::BlendingMode::DST_ATOP:
-    #if defined(ARCH_X86_HAVE_SSSE3)
-        if (mUsesSimd) {
-            if ((x1 + 8) < x2) {
-                uint32_t len = (x2 - x1) >> 3;
-                rsdIntrinsicBlendDstAtop_K(out, in, len);
-                x1 += len << 3;
-                out += len << 3;
-                in += len << 3;
-            }
-        }
-    #endif
-        for (;x1 < x2; x1++, out++, in++) {
-            // uint arithmetic for the same reason as in SRC_ATOP: the sum can exceed a ushort.
-            uint4 in_s = convert<uint4>(*in);
-            uint4 out_s = convert<uint4>(*out);
-            out_s.xyz = ((out_s.xyz * in_s.w) +
-              (in_s.xyz * ((uint3)255 - (uint3)out_s.w))) >> (uint3)8;
-            out_s.w = in_s.w;
-            *out = convertClipped(out_s);
-        }
-        break;
-    // out = in ^ out
-    case RenderScriptToolkit::BlendingMode::XOR:
-    #if defined(ARCH_X86_HAVE_SSSE3)
-        if (mUsesSimd) {
-            if ((x1 + 8) < x2) {
-                uint32_t len = (x2 - x1) >> 3;
-                rsdIntrinsicBlendXor_K(out, in, len);
-                x1 += len << 3;
-                out += len << 3;
-                in += len << 3;
-            }
-        }
-    #endif
-        for (;x1 < x2; x1++, out++, in++) {
-            *out = *in ^ *out;
-        }
-        break;
-    // out = (in * out) >> 8
-    case RenderScriptToolkit::BlendingMode::MULTIPLY:
-    #if defined(ARCH_X86_HAVE_SSSE3)
-        if (mUsesSimd) {
-            if ((x1 + 8) < x2) {
-                uint32_t len = (x2 - x1) >> 3;
-                rsdIntrinsicBlendMultiply_K(out, in, len);
-                x1 += len << 3;
-                out += len << 3;
-                in += len << 3;
-            }
-        }
-    #endif
-        for (;x1 < x2; x1++, out++, in++) {
-          *out = convert<uchar4>((convert<ushort4>(*in) * convert<ushort4>(*out))
-                                >> (ushort4)8);
-        }
-        break;
-    // out = min(out + in, 255), per component
-    case RenderScriptToolkit::BlendingMode::ADD:
-    #if defined(ARCH_X86_HAVE_SSSE3)
-        if (mUsesSimd) {
-            if((x1 + 8) < x2) {
-                uint32_t len = (x2 - x1) >> 3;
-                rsdIntrinsicBlendAdd_K(out, in, len);
-                x1 += len << 3;
-                out += len << 3;
-                in += len << 3;
-            }
-        }
-    #endif
-        for (;x1 < x2; x1++, out++, in++) {
-            uint32_t iR = in->x, iG = in->y, iB = in->z, iA = in->w,
-                oR = out->x, oG = out->y, oB = out->z, oA = out->w;
-            out->x = (oR + iR) > 255 ? 255 : oR + iR;
-            out->y = (oG + iG) > 255 ? 255 : oG + iG;
-            out->z = (oB + iB) > 255 ? 255 : oB + iB;
-            out->w = (oA + iA) > 255 ? 255 : oA + iA;
-        }
-        break;
-    // out = max(out - in, 0), per component
-    case RenderScriptToolkit::BlendingMode::SUBTRACT:
-    #if defined(ARCH_X86_HAVE_SSSE3)
-        if (mUsesSimd) {
-            if((x1 + 8) < x2) {
-                uint32_t len = (x2 - x1) >> 3;
-                rsdIntrinsicBlendSub_K(out, in, len);
-                x1 += len << 3;
-                out += len << 3;
-                in += len << 3;
-            }
-        }
-    #endif
-        for (;x1 < x2; x1++, out++, in++) {
-            // Signed arithmetic so that the difference can be tested against zero.
-            int32_t iR = in->x, iG = in->y, iB = in->z, iA = in->w,
-                oR = out->x, oG = out->y, oB = out->z, oA = out->w;
-            out->x = (oR - iR) < 0 ? 0 : oR - iR;
-            out->y = (oG - iG) < 0 ? 0 : oG - iG;
-            out->z = (oB - iB) < 0 ? 0 : oB - iB;
-            out->w = (oA - iA) < 0 ? 0 : oA - iA;
-        }
-        break;
-
-    default:
-        // The mode is cast explicitly: an enum passed through varargs is not an int.
-        ALOGE("Called unimplemented value %d", static_cast<int>(mode));
-        assert(false);
-    }
-}
-
-// Blends the tile [startX, endX) x [startY, endY) one row at a time. Pixel (x, y) is found at
-// index y * mSizeX + x in both buffers.
-void BlendTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
-                            size_t endY) {
-    size_t row = startY;
-    while (row < endY) {
-        const size_t firstPixel = row * mSizeX + startX;
-        blend(mMode, mIn + firstPixel, mOut + firstPixel, endX - startX);
-        row++;
-    }
-}
-
-// Public entry point: wraps the arguments in a BlendTask and hands it to the task processor.
-// When ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE is defined, a restriction rejected by
-// validRestriction() makes the call return without doing any work.
-void RenderScriptToolkit::blend(BlendingMode mode, const uint8_t* in, uint8_t* out, size_t sizeX,
-                                size_t sizeY, const Restriction* restriction) {
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
-    if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) return;
-#endif
-
-    BlendTask blendTask{mode, in, out, sizeX, sizeY, restriction};
-    processor->doTask(&blendTask);
-}
-
-}  // namespace renderscript
-}  // namespace android

diff --git a/toolkit/Blend_advsimd.S b/toolkit/Blend_advsimd.S
deleted file mode 100644
index e5cb29b..0000000
--- a/toolkit/Blend_advsimd.S
+++ /dev/null

@@ -1,622 +0,0 @@
-/*
- * Copyright (C) 2013-2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* ENTRY(f) switches to .text, applies ".align 4", and declares f as a global function symbol
- * followed by its label. END(f) records the size of f as the distance from the label to the
- * current position.
- */
-#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
-#define END(f) .size f, .-f;
-
-/* X-macro table of the supported blend operations: BLEND_LIST(X) expands to one
- * X(index, NAME) invocation per operation, with indices 0 to 14.
- * NOTE(review): the indices presumably match the integer value of the blending mode that the
- * C++ caller passes as `slot` -- confirm against the enum definition.
- */
-#define BLEND_LIST(X) \
-    X(0, CLEAR) \
-    X(1, SRC) \
-    X(2, DST) \
-    X(3, SRC_OVER) \
-    X(4, DST_OVER) \
-    X(5, SRC_IN) \
-    X(6, DST_IN) \
-    X(7, SRC_OUT) \
-    X(8, DST_OUT) \
-    X(9, SRC_ATOP) \
-    X(10, DST_ATOP) \
-    X(11, XOR) \
-    X(12, MULTIPLY) \
-    X(13, ADD) \
-    X(14, SUBTRACT)
-
-/* This operation was not enabled in the original RenderScript. We could
- * enable it.
- *
- *  X(15, DIFFERENCE) \
- */
-
-/* For every blend operation supported, define a macro with just the arithmetic
- * component.  The rest can be handled later on.
- *
- * At entry q0-q3 contain the RGBA data from the destination buffer, and q8-q11
- * contain the data from the source buffer.  Both have already been split out
- * into one colour component per register (if necessary).  q3 and q11 contain
- * the alpha components.
- *
- * At the same time as defining the assembly macro, define a corresponding
- * preprocessor macro indicating any other requirements.
- *    zipped=0 -- The macro does not require the RGBA components to be
- *                separated.
- *    lddst=0  -- The macro does not require data from the destination buffer.
- *    ldsrc=0  -- The macro does not require data from the source buffer.
- *    nowrap=1 -- The macro requires no wrapper at all, and should simply be
- *                inserted without any surrounding load/store or loop code.
- */
-
-/* CLEAR: writes zero to all four result registers v0-v3. Neither buffer is loaded and the
- * components are not separated (zipped=0, lddst=0, ldsrc=0).
- */
-#define params_CLEAR zipped=0, lddst=0, ldsrc=0
-.macro blend_kernel_CLEAR
-        movi    v0.16b, #0
-        movi    v1.16b, #0
-        movi    v2.16b, #0
-        movi    v3.16b, #0
-.endm
-
-/* SRC: copies the source registers v8-v11 into the result registers v0-v3. The destination
- * buffer is not loaded and the components are not separated (zipped=0, lddst=0).
- */
-#define params_SRC zipped=0, lddst=0
-.macro blend_kernel_SRC
-        mov     v0.16b, v8.16b
-        mov     v1.16b, v9.16b
-        mov     v2.16b, v10.16b
-        mov     v3.16b, v11.16b
-.endm
-
-/* DST: the destination is left as it is, so the kernel body is empty and no wrapper code is
- * requested (nowrap=1).
- */
-#define params_DST nowrap=1
-.macro blend_kernel_DST
-        /* nop */
-.endm
-
-/* SRC_OVER: result = src + dst * (255 - src alpha) / 255, with a saturating final add.
- * Destination channels are in v0-v3, source channels in v8-v11 (alpha in v3 and v11).
- * v4-v7 and v12-v15 are used as scratch.
- */
-#define params_SRC_OVER zipped=1
-.macro blend_kernel_SRC_OVER
-        /* v7 = bitwise NOT of the source alpha, i.e. 255 - alpha for each byte. */
-        mvn         v7.16b, v11.16b
-
-        /* 16-bit products of inverted alpha and each destination channel: upper eight lanes
-         * go to v12-v15, lower eight lanes replace v0-v3.
-         */
-        umull2      v12.8h, v7.16b, v0.16b
-        umull       v0.8h,  v7.8b,  v0.8b
-        umull2      v13.8h, v7.16b, v1.16b
-        umull       v1.8h,  v7.8b,  v1.8b
-        umull2      v14.8h, v7.16b, v2.16b
-        umull       v2.8h,  v7.8b,  v2.8b
-        umull2      v15.8h, v7.16b, v3.16b
-        umull       v3.8h,  v7.8b,  v3.8b
-
-        /* Divide the products by 255 in three steps: take the rounded product >> 8 ... */
-        rshrn       v4.8b,  v0.8h,  #8
-        rshrn2      v4.16b, v12.8h, #8
-        rshrn       v5.8b,  v1.8h,  #8
-        rshrn2      v5.16b, v13.8h, #8
-        rshrn       v6.8b,  v2.8h,  #8
-        rshrn2      v6.16b, v14.8h, #8
-        rshrn       v7.8b,  v3.8h,  #8
-        rshrn2      v7.16b, v15.8h, #8
-
-        /* ... add it back onto the product ... */
-        uaddw       v0.8h,  v0.8h,  v4.8b
-        uaddw2      v12.8h, v12.8h, v4.16b
-        uaddw       v1.8h,  v1.8h,  v5.8b
-        uaddw2      v13.8h, v13.8h, v5.16b
-        uaddw       v2.8h,  v2.8h,  v6.8b
-        uaddw2      v14.8h, v14.8h, v6.16b
-        uaddw       v3.8h,  v3.8h,  v7.8b
-        uaddw2      v15.8h, v15.8h, v7.16b
-
-        /* ... and narrow the sum with a second rounded >> 8, back into bytes in v0-v3. */
-        rshrn       v0.8b,  v0.8h,  #8
-        rshrn2      v0.16b, v12.8h, #8
-        rshrn       v1.8b,  v1.8h,  #8
-        rshrn2      v1.16b, v13.8h, #8
-        rshrn       v2.8b,  v2.8h,  #8
-        rshrn2      v2.16b, v14.8h, #8
-        rshrn       v3.8b,  v3.8h,  #8
-        rshrn2      v3.16b, v15.8h, #8
-
-        /* Saturating add of the source channels onto the scaled destination. */
-        uqadd       v0.16b, v0.16b, v8.16b
-        uqadd       v1.16b, v1.16b, v9.16b
-        uqadd       v2.16b, v2.16b, v10.16b
-        uqadd       v3.16b, v3.16b, v11.16b
-.endm
-
-/* DST_OVER: result = dst + src * (255 - dst alpha) / 255, with a saturating final add.
- * Mirror image of SRC_OVER: here the source registers v8-v11 are scaled (and overwritten)
- * and then added to the destination registers v0-v3.
- */
-#define params_DST_OVER zipped=1
-.macro blend_kernel_DST_OVER
-        /* v7 = bitwise NOT of the destination alpha, i.e. 255 - alpha for each byte. */
-        mvn         v7.16b, v3.16b
-
-        /* 16-bit products of inverted alpha and each source channel: upper eight lanes go
-         * to v12-v15, lower eight lanes replace v8-v11.
-         */
-        umull2      v12.8h, v7.16b, v8.16b
-        umull       v8.8h,  v7.8b,  v8.8b
-        umull2      v13.8h, v7.16b, v9.16b
-        umull       v9.8h,  v7.8b,  v9.8b
-        umull2      v14.8h, v7.16b, v10.16b
-        umull       v10.8h, v7.8b,  v10.8b
-        umull2      v15.8h, v7.16b, v11.16b
-        umull       v11.8h, v7.8b,  v11.8b
-
-        /* Divide by 255: rounded product >> 8 ... */
-        rshrn       v4.8b,  v8.8h,  #8
-        rshrn2      v4.16b, v12.8h, #8
-        rshrn       v5.8b,  v9.8h,  #8
-        rshrn2      v5.16b, v13.8h, #8
-        rshrn       v6.8b,  v10.8h, #8
-        rshrn2      v6.16b, v14.8h, #8
-        rshrn       v7.8b,  v11.8h, #8
-        rshrn2      v7.16b, v15.8h, #8
-
-        /* ... added back onto the product ... */
-        uaddw       v8.8h,  v8.8h,  v4.8b
-        uaddw2      v12.8h, v12.8h, v4.16b
-        uaddw       v9.8h,  v9.8h,  v5.8b
-        uaddw2      v13.8h, v13.8h, v5.16b
-        uaddw       v10.8h, v10.8h, v6.8b
-        uaddw2      v14.8h, v14.8h, v6.16b
-        uaddw       v11.8h, v11.8h, v7.8b
-        uaddw2      v15.8h, v15.8h, v7.16b
-
-        /* ... then a second rounded >> 8 narrows the sums back to bytes in v8-v11. */
-        rshrn       v8.8b,  v8.8h,  #8
-        rshrn2      v8.16b, v12.8h, #8
-        rshrn       v9.8b,  v9.8h,  #8
-        rshrn2      v9.16b, v13.8h, #8
-        rshrn       v10.8b,  v10.8h, #8
-        rshrn2      v10.16b, v14.8h, #8
-        rshrn       v11.8b,  v11.8h, #8
-        rshrn2      v11.16b, v15.8h, #8
-
-        /* Saturating add of the scaled source onto the destination channels. */
-        uqadd       v0.16b, v0.16b, v8.16b
-        uqadd       v1.16b, v1.16b, v9.16b
-        uqadd       v2.16b, v2.16b, v10.16b
-        uqadd       v3.16b, v3.16b, v11.16b
-.endm
-
-/* SRC_IN: result = src * dst alpha / 255, written to v0-v3.
- * The macro multiplies by whatever v3 holds on entry, which lets blend_kernel_SRC_OUT reuse
- * it after inverting v3.
- */
-#define params_SRC_IN zipped=1
-.macro blend_kernel_SRC_IN
-        /* 16-bit products of v3 and each source channel. The alpha pair comes last because
-         * its lower-half multiply overwrites v3, which every earlier multiply reads.
-         */
-        umull2      v12.8h, v3.16b, v8.16b
-        umull       v0.8h,  v3.8b,  v8.8b
-        umull2      v13.8h, v3.16b, v9.16b
-        umull       v1.8h,  v3.8b,  v9.8b
-        umull2      v14.8h, v3.16b, v10.16b
-        umull       v2.8h,  v3.8b,  v10.8b
-        umull2      v15.8h, v3.16b, v11.16b
-        umull       v3.8h,  v3.8b,  v11.8b
-
-        /* Divide by 255: rounded product >> 8 ... */
-        rshrn       v4.8b,  v0.8h,  #8
-        rshrn2      v4.16b, v12.8h, #8
-        rshrn       v5.8b,  v1.8h,  #8
-        rshrn2      v5.16b, v13.8h, #8
-        rshrn       v6.8b,  v2.8h,  #8
-        rshrn2      v6.16b, v14.8h, #8
-        rshrn       v7.8b,  v3.8h,  #8
-        rshrn2      v7.16b, v15.8h, #8
-
-        /* ... added back onto the product ... */
-        uaddw       v0.8h,  v0.8h,  v4.8b
-        uaddw2      v12.8h, v12.8h, v4.16b
-        uaddw       v1.8h,  v1.8h,  v5.8b
-        uaddw2      v13.8h, v13.8h, v5.16b
-        uaddw       v2.8h,  v2.8h,  v6.8b
-        uaddw2      v14.8h, v14.8h, v6.16b
-        uaddw       v3.8h,  v3.8h,  v7.8b
-        uaddw2      v15.8h, v15.8h, v7.16b
-
-        /* ... then a second rounded >> 8 narrows the sums back to bytes in v0-v3. */
-        rshrn       v0.8b,  v0.8h,  #8
-        rshrn2      v0.16b, v12.8h, #8
-        rshrn       v1.8b,  v1.8h,  #8
-        rshrn2      v1.16b, v13.8h, #8
-        rshrn       v2.8b,  v2.8h,  #8
-        rshrn2      v2.16b, v14.8h, #8
-        rshrn       v3.8b,  v3.8h,  #8
-        rshrn2      v3.16b, v15.8h, #8
-.endm
-
-/* DST_IN: result = dst * src alpha / 255, written to v0-v3.
- * The macro multiplies by whatever v11 holds on entry, which lets blend_kernel_DST_OUT reuse
- * it after inverting v11.
- */
-#define params_DST_IN zipped=1
-.macro blend_kernel_DST_IN
-        /* 16-bit products of each destination channel and v11: upper eight lanes go to
-         * v12-v15, lower eight lanes replace v0-v3.
-         */
-        umull2      v12.8h, v0.16b, v11.16b
-        umull       v0.8h,  v0.8b,  v11.8b
-        umull2      v13.8h, v1.16b, v11.16b
-        umull       v1.8h,  v1.8b,  v11.8b
-        umull2      v14.8h, v2.16b, v11.16b
-        umull       v2.8h,  v2.8b,  v11.8b
-        umull2      v15.8h, v3.16b, v11.16b
-        umull       v3.8h,  v3.8b,  v11.8b
-
-        /* Divide by 255: rounded product >> 8 ... */
-        rshrn       v4.8b,  v0.8h,  #8
-        rshrn2      v4.16b, v12.8h, #8
-        rshrn       v5.8b,  v1.8h,  #8
-        rshrn2      v5.16b, v13.8h, #8
-        rshrn       v6.8b,  v2.8h,  #8
-        rshrn2      v6.16b, v14.8h, #8
-        rshrn       v7.8b,  v3.8h,  #8
-        rshrn2      v7.16b, v15.8h, #8
-
-        /* ... added back onto the product ... */
-        uaddw       v0.8h,  v0.8h,  v4.8b
-        uaddw2      v12.8h, v12.8h, v4.16b
-        uaddw       v1.8h,  v1.8h,  v5.8b
-        uaddw2      v13.8h, v13.8h, v5.16b
-        uaddw       v2.8h,  v2.8h,  v6.8b
-        uaddw2      v14.8h, v14.8h, v6.16b
-        uaddw       v3.8h,  v3.8h,  v7.8b
-        uaddw2      v15.8h, v15.8h, v7.16b
-
-        /* ... then a second rounded >> 8 narrows the sums back to bytes in v0-v3. */
-        rshrn       v0.8b,  v0.8h,  #8
-        rshrn2      v0.16b, v12.8h, #8
-        rshrn       v1.8b,  v1.8h,  #8
-        rshrn2      v1.16b, v13.8h, #8
-        rshrn       v2.8b,  v2.8h,  #8
-        rshrn2      v2.16b, v14.8h, #8
-        rshrn       v3.8b,  v3.8h,  #8
-        rshrn2      v3.16b, v15.8h, #8
-.endm
-
-/* SRC_OUT: result = src * (255 - dst alpha) / 255. Inverts the destination alpha in v3 and
- * then reuses the SRC_IN kernel.
- */
-#define params_SRC_OUT zipped=1
-.macro blend_kernel_SRC_OUT
-        mvn         v3.16b, v3.16b
-        blend_kernel_SRC_IN
-.endm
-
-
-/* DST_OUT: result = dst * (255 - src alpha) / 255. Inverts the source alpha in v11 and then
- * reuses the DST_IN kernel.
- */
-#define params_DST_OUT zipped=1
-.macro blend_kernel_DST_OUT
-        mvn         v11.16b, v11.16b
-        blend_kernel_DST_IN
-.endm
-
-/* SRC_ATOP: colour result = (dst * (255 - src alpha) + src * dst alpha) / 255, written to
- * v0-v2. The destination alpha in v3 is not written, so it is the output alpha. v11 is left
- * holding the inverted source alpha.
- */
-#define params_SRC_ATOP zipped=1
-.macro blend_kernel_SRC_ATOP
-        /* v11 = bitwise NOT of the source alpha, i.e. 255 - alpha for each byte. */
-        mvn         v11.16b, v11.16b
-
-        /* Destination colour channels times the inverted source alpha. */
-        umull2      v12.8h, v11.16b, v0.16b
-        umull       v0.8h,  v11.8b,  v0.8b
-        umull2      v13.8h, v11.16b, v1.16b
-        umull       v1.8h,  v11.8b,  v1.8b
-        umull2      v14.8h, v11.16b, v2.16b
-        umull       v2.8h,  v11.8b,  v2.8b
-
-        /* Source colour channels times the destination alpha. */
-        umull2      v4.8h,  v3.16b, v8.16b
-        umull       v8.8h,  v3.8b,  v8.8b
-        umull2      v5.8h,  v3.16b, v9.16b
-        umull       v9.8h,  v3.8b,  v9.8b
-        umull2      v6.8h,  v3.16b, v10.16b
-        umull       v10.8h, v3.8b,  v10.8b
-
-        /* Saturating 16-bit sum of the two products. */
-        uqadd       v12.8h, v12.8h, v4.8h
-        uqadd       v0.8h,  v0.8h,  v8.8h
-        uqadd       v13.8h, v13.8h, v5.8h
-        uqadd       v1.8h,  v1.8h,  v9.8h
-        uqadd       v14.8h, v14.8h, v6.8h
-        uqadd       v2.8h,  v2.8h,  v10.8h
-
-        /* Divide by 255: rounded sum >> 8 ... */
-        urshr       v8.8h,  v0.8h,  #8
-        urshr       v4.8h,  v12.8h, #8
-        urshr       v9.8h,  v1.8h,  #8
-        urshr       v5.8h,  v13.8h, #8
-        urshr       v10.8h, v2.8h,  #8
-        urshr       v6.8h,  v14.8h, #8
-
-        /* ... added back onto the sum with saturation ... */
-        uqadd       v0.8h,  v0.8h,  v8.8h
-        uqadd       v12.8h, v12.8h, v4.8h
-        uqadd       v1.8h,  v1.8h,  v9.8h
-        uqadd       v13.8h, v13.8h, v5.8h
-        uqadd       v2.8h,  v2.8h,  v10.8h
-        uqadd       v14.8h, v14.8h, v6.8h
-
-        /* ... then a saturating, rounded >> 8 narrows the result to bytes in v0-v2. */
-        uqrshrn     v0.8b,  v0.8h,  #8
-        uqrshrn2    v0.16b, v12.8h, #8
-        uqrshrn     v1.8b,  v1.8h,  #8
-        uqrshrn2    v1.16b, v13.8h, #8
-        uqrshrn     v2.8b,  v2.8h,  #8
-        uqrshrn2    v2.16b, v14.8h, #8
-.endm
-
-#define params_DST_ATOP zipped=1
-.macro blend_kernel_DST_ATOP
-        mvn         v3.16b, v3.16b
-
-        umull2      v12.8h, v11.16b, v0.16b
-        umull       v0.8h,  v11.8b,  v0.8b
-        umull2      v13.8h, v11.16b, v1.16b
-        umull       v1.8h,  v11.8b,  v1.8b
-        umull2      v14.8h, v11.16b, v2.16b
-        umull       v2.8h,  v11.8b,  v2.8b
-
-        umull2      v4.8h,  v3.16b, v8.16b
-        umull       v8.8h,  v3.8b,  v8.8b
-        umull2      v5.8h,  v3.16b, v9.16b
-        umull       v9.8h,  v3.8b,  v9.8b
-        umull2      v6.8h,  v3.16b, v10.16b
-        umull       v10.8h, v3.8b,  v10.8b
-
-        uqadd       v12.8h, v12.8h, v4.8h
-        uqadd       v0.8h,  v0.8h,  v8.8h
-        uqadd       v13.8h, v13.8h, v5.8h
-        uqadd       v1.8h,  v1.8h,  v9.8h
-        uqadd       v14.8h, v14.8h, v6.8h
-        uqadd       v2.8h,  v2.8h,  v10.8h
-
-        urshr       v8.8h,  v0.8h,  #8
-        urshr       v4.8h,  v12.8h, #8
-        urshr       v9.8h,  v1.8h,  #8
-        urshr       v5.8h,  v13.8h, #8
-        urshr       v10.8h, v2.8h,  #8
-        urshr       v6.8h,  v14.8h, #8
-
-        uqadd       v0.8h,  v0.8h,  v8.8h
-        uqadd       v12.8h, v12.8h, v4.8h
-        uqadd       v1.8h,  v1.8h,  v9.8h
-        uqadd       v13.8h, v13.8h, v5.8h
-        uqadd       v2.8h,  v2.8h,  v10.8h
-        uqadd       v14.8h, v14.8h, v6.8h
-
-        uqrshrn     v0.8b,  v0.8h,  #8
-        uqrshrn2    v0.16b, v12.8h, #8
-        uqrshrn     v1.8b,  v1.8h,  #8
-        uqrshrn2    v1.16b, v13.8h, #8
-        uqrshrn     v2.8b,  v2.8h,  #8
-        uqrshrn2    v2.16b, v14.8h, #8
-
-        mov         v3.16b, v11.16b
-.endm
-
-#define params_MULTIPLY zipped=0
-.macro blend_kernel_MULTIPLY
-        umull2      v12.8h, v0.16b, v8.16b
-        umull       v0.8h,  v0.8b,  v8.8b
-        umull2      v13.8h, v1.16b, v9.16b
-        umull       v1.8h,  v1.8b,  v9.8b
-        umull2      v14.8h, v2.16b, v10.16b
-        umull       v2.8h,  v2.8b,  v10.8b
-        umull2      v15.8h, v3.16b, v11.16b
-        umull       v3.8h,  v3.8b,  v11.8b
-
-        rshrn       v4.8b,  v0.8h,  #8
-        rshrn2      v4.16b, v12.8h, #8
-        rshrn       v5.8b,  v1.8h,  #8
-        rshrn2      v5.16b, v13.8h, #8
-        rshrn       v6.8b,  v2.8h,  #8
-        rshrn2      v6.16b, v14.8h, #8
-        rshrn       v7.8b,  v3.8h,  #8
-        rshrn2      v7.16b, v15.8h, #8
-
-        uaddw       v0.8h,  v0.8h,  v4.8b
-        uaddw2      v12.8h, v12.8h, v4.16b
-        uaddw       v1.8h,  v1.8h,  v5.8b
-        uaddw2      v13.8h, v13.8h, v5.16b
-        uaddw       v2.8h,  v2.8h,  v6.8b
-        uaddw2      v14.8h, v14.8h, v6.16b
-        uaddw       v3.8h,  v3.8h,  v7.8b
-        uaddw2      v15.8h, v15.8h, v7.16b
-
-        rshrn       v0.8b,  v0.8h,  #8
-        rshrn2      v0.16b, v12.8h, #8
-        rshrn       v1.8b,  v1.8h,  #8
-        rshrn2      v1.16b, v13.8h, #8
-        rshrn       v2.8b,  v2.8h,  #8
-        rshrn2      v2.16b, v14.8h, #8
-        rshrn       v3.8b,  v3.8h,  #8
-        rshrn2      v3.16b, v15.8h, #8
-.endm
-
-#define params_ADD zipped=0
-.macro blend_kernel_ADD
-        uqadd    v0.16b, v0.16b, v8.16b
-        uqadd    v1.16b, v1.16b, v9.16b
-        uqadd    v2.16b, v2.16b, v10.16b
-        uqadd    v3.16b, v3.16b, v11.16b
-.endm
-
-#define params_SUBTRACT zipped=0
-.macro blend_kernel_SUBTRACT
-        uqsub    v0.16b, v0.16b, v8.16b
-        uqsub    v1.16b, v1.16b, v9.16b
-        uqsub    v2.16b, v2.16b, v10.16b
-        uqsub    v3.16b, v3.16b, v11.16b
-.endm
-
-#define params_DIFFERENCE zipped=0
-.macro blend_kernel_DIFFERENCE
-        uabd    v0.16b, v0.16b, v8.16b
-        uabd    v1.16b, v1.16b, v9.16b
-        uabd    v2.16b, v2.16b, v10.16b
-        uabd    v3.16b, v3.16b, v11.16b
-.endm
-
-#define params_XOR zipped=0
-.macro blend_kernel_XOR
-        eor     v0.16b, v0.16b, v8.16b
-        eor     v1.16b, v1.16b, v9.16b
-        eor     v2.16b, v2.16b, v10.16b
-        eor     v3.16b, v3.16b, v11.16b
-.endm
-
-
-/* Define the wrapper code which will load and store the data, iterate the
- * correct number of times, and safely handle the remainder at the end of the
- * loop.  Various sections of assembly code are dropped or substituted for
- * simpler operations if they're not needed.
- */
-.macro wrap_line kernel, nowrap=0, zipped=1, lddst=1, ldsrc=1, pld=1
-.if \nowrap
-        \kernel
-.else
-        sub     x3, sp, #32
-        sub     sp, sp, #64
-        st1     {v8.1d - v11.1d}, [sp]
-        st1     {v12.1d - v15.1d}, [x3]
-        subs    x2, x2, #64
-        b       2f
-.align 4
-1:
-  .if \lddst
-    .if \zipped
-        ld4     {v0.16b - v3.16b}, [x0]
-    .else
-        ld1     {v0.16b - v3.16b}, [x0]
-    .endif
-  .endif
-  .if \ldsrc
-    .if \zipped
-        ld4     {v8.16b - v11.16b}, [x1], #64
-    .else
-        ld1     {v8.16b - v11.16b}, [x1], #64
-    .endif
-  .endif
-  .if \pld
-#if 0 /* TODO: test this on real hardware */
-    .if \lddst ; prfm PLDL1STRM, [x0, #192] ; .endif
-    .if \ldsrc ; prfm PLDL1STRM, [x1, #192] ; .endif
-#endif
-  .endif
-
-        \kernel
-
-        subs    x2, x2, #64
-  .if \zipped
-        st4     {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
-  .else
-        st1     {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
-  .endif
-
-2:      bge     1b
-        adds    x2, x2, #64
-        beq     2f
-
-        /* To handle the tail portion of the data (something less than 64
-         * bytes) load small power-of-two chunks into working registers.  It
-         * doesn't matter where they end up in the register; the same process
-         * will store them back out using the same positions and the operations
-         * don't require data to interact with its neighbours.
-         */
-        movi    v0.16b, #0
-        movi    v1.16b, #0
-        movi    v2.16b, #0
-        movi    v3.16b, #0
-
-        movi    v8.16b, #0
-        movi    v9.16b, #0
-        movi    v10.16b, #0
-        movi    v11.16b, #0
-
-        tbz     x2, #5, 1f
-  .if \lddst ; ld1     {v2.16b,v3.16b}, [x0], #32   ; .endif
-  .if \ldsrc ; ld1     {v10.16b,v11.16b}, [x1], #32 ; .endif
-1:      tbz     x2, #4, 1f
-  .if \lddst ; ld1     {v1.16b}, [x0], #16  ; .endif
-  .if \ldsrc ; ld1     {v9.16b}, [x1], #16  ; .endif
-1:      tbz     x2, #3, 1f
-  .if \lddst ; ld1     {v0.d}[1], [x0], #8 ; .endif
-  .if \ldsrc ; ld1     {v8.d}[1], [x1], #8 ; .endif
-1:      tbz     x2, #2, 1f
-  .if \lddst ; ld1     {v0.s}[1], [x0], #4 ; .endif
-  .if \ldsrc ; ld1     {v8.s}[1], [x1], #4 ; .endif
-1:      tbz     x2, #1, 1f
-  .if \lddst ; ld1     {v0.h}[1], [x0], #2 ; .endif
-  .if \ldsrc ; ld1     {v8.h}[1], [x1], #2 ; .endif
-1:      tbz     x2, #0, 1f
-  .if \lddst ; ld1     {v0.b}[1], [x0], #1 ; .endif
-  .if \ldsrc ; ld1     {v8.b}[1], [x1], #1 ; .endif
-1:
-  .if \lddst ; sub     x0, x0, x2           ; .endif
-
-.if \zipped
-        /* One small impediment in the process above is that some of the load
-         * operations can't perform byte-wise structure deinterleaving at the
-         * same time as loading only part of a register.  So the data is loaded
-         * linearly and unpacked manually at this point.
-         */
-        uzp1    v4.16b, v0.16b, v1.16b
-        uzp2    v5.16b, v0.16b, v1.16b
-        uzp1    v6.16b, v2.16b, v3.16b
-        uzp2    v7.16b, v2.16b, v3.16b
-        uzp1    v0.16b, v4.16b, v6.16b
-        uzp2    v2.16b, v4.16b, v6.16b
-        uzp1    v1.16b, v5.16b, v7.16b
-        uzp2    v3.16b, v5.16b, v7.16b
-
-        uzp1    v4.16b, v8.16b, v9.16b
-        uzp2    v5.16b, v8.16b, v9.16b
-        uzp1    v6.16b, v10.16b, v11.16b
-        uzp2    v7.16b, v10.16b, v11.16b
-        uzp1    v8.16b, v4.16b, v6.16b
-        uzp2    v10.16b, v4.16b, v6.16b
-        uzp1    v9.16b, v5.16b, v7.16b
-        uzp2    v11.16b, v5.16b, v7.16b
-
-        \kernel
-
-        zip1    v4.16b, v0.16b, v2.16b
-        zip2    v6.16b, v0.16b, v2.16b
-        zip1    v5.16b, v1.16b, v3.16b
-        zip2    v7.16b, v1.16b, v3.16b
-        zip1    v0.16b, v4.16b, v5.16b
-        zip2    v1.16b, v4.16b, v5.16b
-        zip1    v2.16b, v6.16b, v7.16b
-        zip2    v3.16b, v6.16b, v7.16b
-  .else
-        \kernel
-  .endif
-
-        tbz     x2, #5, 1f
-        st1     {v2.16b,v3.16b}, [x0], #32
-1:      tbz     x2, #4, 1f
-        st1     {v1.16b}, [x0], #16
-1:      tbz     x2, #3, 1f
-        st1     {v0.d}[1], [x0], #8
-1:      tbz     x2, #2, 1f
-        st1     {v0.s}[1], [x0], #4
-1:      tbz     x2, #1, 1f
-        st1     {v0.h}[1], [x0], #2
-1:      tbz     x2, #0, 2f
-        st1     {v0.b}[1], [x0], #1
-2:      ld1     {v8.1d - v11.1d}, [sp], #32
-        ld1     {v12.1d - v15.1d}, [sp], #32
-.endif
-        mov     x0, #0
-        ret
-.endm
-
-
-/* produce list of blend_line_XX() functions; each function uses the wrap_line
- * macro, passing it the name of the operation macro it wants along with
- * optional parameters to remove unnecessary operations.
- */
-#define BLEND_X(d, n) ENTRY(blend_line_##n) ; wrap_line blend_kernel_##n, params_##n ; END(blend_line_##n) ;
-    BLEND_LIST(BLEND_X)
-#undef BLEND_X
-
-#define BLEND_X(d, n) .set tablesize, d+1 ;
-    BLEND_LIST(BLEND_X)
-#undef BLEND_X
-
-/*  int rsdIntrinsicBlend_K(
- *          uchar4 *out,        // x0
- *          uchar4 const *in,   // x1
- *          int slot,           // x2
- *          size_t xstart,      // x3
- *          size_t xend);       // x4
- */
-ENTRY(rsdIntrinsicBlend_K)
-    adrp    x5, blendtable
-    add     x5, x5, :lo12:blendtable
-    cmp     w2, tablesize
-    bhs     1f
-    ldrsh   x6, [x5, w2, uxtw #1]
-    add     x0, x0, w3, uxtw #2
-    add     x1, x1, w3, uxtw #2
-    sub     w2, w4, w3
-    ubfiz   x2, x2, #2, #32 /* TODO: fix */
-    cbz     x6, 1f
-    adr     x5, 2f
-    add     x6, x5, x6
-2:  br      x6
-1:  mov     x0, #-1
-    ret
-
-END(rsdIntrinsicBlend_K)
-
-.rodata
-.set off,0
-blendtable:
-#define BLEND_X(d, n) .rept d-off ; .hword 0 ; .endr ; .hword blend_line_##n - 2b ; .set off, d+1 ;
-        BLEND_LIST(BLEND_X)
-#undef BLEND_X

diff --git a/toolkit/Blend_neon.S b/toolkit/Blend_neon.S
deleted file mode 100644
index a1fa1b5..0000000
--- a/toolkit/Blend_neon.S
+++ /dev/null

@@ -1,617 +0,0 @@
-/*
- * Copyright (C) 2013-2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
-#define END(f) .fnend; .size f, .-f;
-
-#define BLEND_LIST(X) \
-    X(0, CLEAR) \
-    X(1, SRC) \
-    X(2, DST) \
-    X(3, SRC_OVER) \
-    X(4, DST_OVER) \
-    X(5, SRC_IN) \
-    X(6, DST_IN) \
-    X(7, SRC_OUT) \
-    X(8, DST_OUT) \
-    X(9, SRC_ATOP) \
-    X(10, DST_ATOP) \
-    X(11, XOR) \
-    X(14, MULTIPLY) \
-    X(21, DIFFERENCE) \
-    X(34, ADD) \
-    X(35, SUBTRACT)
-
-.eabi_attribute 25,1 @Tag_ABI_align8_preserved
-.arm
-
-/* For every blend operation supported, define a macro with just the arithmetic
- * component.  The rest can be handled later on.
- *
- * At entry q0-q3 contain the RGBA data from the destination buffer, and q8-q11
- * contain the data from the source buffer.  Both have already been split out
- * into one colour component per register (if necessary).  q3 and q11 contain
- * the alpha components.
- *
- * At the same time as defining the assembly macro, define a corresponding
- * preprocessor macro indicating any other requirements.
- *    zipped=0 -- The macro does not require the RGBA components to be
- *                separated.
- *    lddst=0  -- The macro does not require data from the destination buffer.
- *    ldsrc=0  -- The macro does not require data from the source buffer.
- *    nowrap=1 -- The macro requires no wrapper at all, and should simply be
- *                inserted without any surrounding load/store or loop code.
- */
-
-#define params_CLEAR zipped=0, lddst=0, ldsrc=0
-.macro blend_kernel_CLEAR
-        vmov.i8 q0, #0
-        vmov.i8 q1, #0
-        vmov.i8 q2, #0
-        vmov.i8 q3, #0
-.endm
-
-#define params_SRC zipped=0, lddst=0
-.macro blend_kernel_SRC
-        vmov    q0, q8
-        vmov    q1, q9
-        vmov    q2, q10
-        vmov    q3, q11
-.endm
-
-#define params_DST nowrap=1
-.macro blend_kernel_DST
-        /* nop */
-.endm
-
-#define params_SRC_OVER zipped=1
-.macro blend_kernel_SRC_OVER
-        vmvn        q7, q11
-
-        vmull.u8    q12, d15, d1
-        vmull.u8    q0,  d14, d0
-        vmull.u8    q13, d15, d3
-        vmull.u8    q1,  d14, d2
-        vmull.u8    q14, d15, d5
-        vmull.u8    q2,  d14, d4
-        vmull.u8    q15, d15, d7
-        vmull.u8    q3,  d14, d6
-
-        vrshrn.u16  d8,  q0,  #8
-        vrshrn.u16  d9,  q12, #8
-        vrshrn.u16  d10, q1,  #8
-        vrshrn.u16  d11, q13, #8
-        vrshrn.u16  d12, q2,  #8
-        vrshrn.u16  d13, q14, #8
-        vrshrn.u16  d14, q3,  #8
-        vrshrn.u16  d15, q15, #8
-
-        vaddw.u8    q0,  d8
-        vaddw.u8    q12, d9
-        vaddw.u8    q1,  d10
-        vaddw.u8    q13, d11
-        vaddw.u8    q2,  d12
-        vaddw.u8    q14, d13
-        vaddw.u8    q3,  d14
-        vaddw.u8    q15, d15
-
-        vrshrn.u16  d0, q0,  #8
-        vrshrn.u16  d1, q12, #8
-        vrshrn.u16  d2, q1,  #8
-        vrshrn.u16  d3, q13, #8
-        vrshrn.u16  d4, q2,  #8
-        vrshrn.u16  d5, q14, #8
-        vrshrn.u16  d6, q3,  #8
-        vrshrn.u16  d7, q15, #8
-
-        vqadd.u8    q0, q8
-        vqadd.u8    q1, q9
-        vqadd.u8    q2, q10
-        vqadd.u8    q3, q11
-.endm
-
-#define params_DST_OVER zipped=1
-.macro blend_kernel_DST_OVER
-        vmvn        q7, q3
-
-        vmull.u8    q12, d15, d17
-        vmull.u8    q8,  d14, d16
-        vmull.u8    q13, d15, d19
-        vmull.u8    q9,  d14, d18
-        vmull.u8    q14, d15, d21
-        vmull.u8    q10, d14, d20
-        vmull.u8    q15, d15, d23
-        vmull.u8    q11, d14, d22
-
-        vrshrn.u16  d8,  q0,  #8
-        vrshrn.u16  d9,  q12, #8
-        vrshrn.u16  d10, q1,  #8
-        vrshrn.u16  d11, q13, #8
-        vrshrn.u16  d12, q2,  #8
-        vrshrn.u16  d13, q14, #8
-        vrshrn.u16  d14, q3,  #8
-        vrshrn.u16  d15, q15, #8
-
-        vaddw.u8    q8,  d8
-        vaddw.u8    q12, d9
-        vaddw.u8    q9,  d10
-        vaddw.u8    q13, d11
-        vaddw.u8    q10, d12
-        vaddw.u8    q14, d13
-        vaddw.u8    q11, d14
-        vaddw.u8    q15, d15
-
-        vrshrn.u16  d16, q8,  #8
-        vrshrn.u16  d17, q12, #8
-        vrshrn.u16  d18, q9,  #8
-        vrshrn.u16  d19, q13, #8
-        vrshrn.u16  d20, q10, #8
-        vrshrn.u16  d21, q14, #8
-        vrshrn.u16  d22, q11, #8
-        vrshrn.u16  d23, q15, #8
-
-        vqadd.u8    q0, q8
-        vqadd.u8    q1, q9
-        vqadd.u8    q2, q10
-        vqadd.u8    q3, q11
-.endm
-
-#define params_SRC_IN zipped=1
-.macro blend_kernel_SRC_IN
-        vmull.u8    q12, d7, d17
-        vmull.u8    q0,  d6, d16
-        vmull.u8    q13, d7, d19
-        vmull.u8    q1,  d6, d18
-        vmull.u8    q14, d7, d21
-        vmull.u8    q2,  d6, d20
-        vmull.u8    q15, d7, d23
-        vmull.u8    q3,  d6, d22
-
-        vrshrn.u16  d8,  q0,  #8
-        vrshrn.u16  d9,  q12, #8
-        vrshrn.u16  d10, q1,  #8
-        vrshrn.u16  d11, q13, #8
-        vrshrn.u16  d12, q2,  #8
-        vrshrn.u16  d13, q14, #8
-        vrshrn.u16  d14, q3,  #8
-        vrshrn.u16  d15, q15, #8
-
-        vaddw.u8    q0,  d8
-        vaddw.u8    q12, d9
-        vaddw.u8    q1,  d10
-        vaddw.u8    q13, d11
-        vaddw.u8    q2,  d12
-        vaddw.u8    q14, d13
-        vaddw.u8    q3,  d14
-        vaddw.u8    q15, d15
-
-        vrshrn.u16  d0, q0,  #8
-        vrshrn.u16  d1, q12, #8
-        vrshrn.u16  d2, q1,  #8
-        vrshrn.u16  d3, q13, #8
-        vrshrn.u16  d4, q2,  #8
-        vrshrn.u16  d5, q14, #8
-        vrshrn.u16  d6, q3,  #8
-        vrshrn.u16  d7, q15, #8
-.endm
-
-#define params_DST_IN zipped=1
-.macro blend_kernel_DST_IN
-        vmull.u8    q12, d1, d23
-        vmull.u8    q0,  d0, d22
-        vmull.u8    q13, d3, d23
-        vmull.u8    q1,  d2, d22
-        vmull.u8    q14, d5, d23
-        vmull.u8    q2,  d4, d22
-        vmull.u8    q15, d7, d23
-        vmull.u8    q3,  d6, d22
-
-        vrshrn.u16  d8,  q0,  #8
-        vrshrn.u16  d9,  q12, #8
-        vrshrn.u16  d10, q1,  #8
-        vrshrn.u16  d11, q13, #8
-        vrshrn.u16  d12, q2,  #8
-        vrshrn.u16  d13, q14, #8
-        vrshrn.u16  d14, q3,  #8
-        vrshrn.u16  d15, q15, #8
-
-        vaddw.u8    q0,  d8
-        vaddw.u8    q12, d9
-        vaddw.u8    q1,  d10
-        vaddw.u8    q13, d11
-        vaddw.u8    q2,  d12
-        vaddw.u8    q14, d13
-        vaddw.u8    q3,  d14
-        vaddw.u8    q15, d15
-
-        vrshrn.u16  d0, q0,  #8
-        vrshrn.u16  d1, q12, #8
-        vrshrn.u16  d2, q1,  #8
-        vrshrn.u16  d3, q13, #8
-        vrshrn.u16  d4, q2,  #8
-        vrshrn.u16  d5, q14, #8
-        vrshrn.u16  d6, q3,  #8
-        vrshrn.u16  d7, q15, #8
-.endm
-
-#define params_SRC_OUT zipped=1
-.macro blend_kernel_SRC_OUT
-        vmvn        q3, q3
-        blend_kernel_SRC_IN
-.endm
-
-
-#define params_DST_OUT zipped=1
-.macro blend_kernel_DST_OUT
-        vmvn        q11, q11
-        blend_kernel_DST_IN
-.endm
-
-#define params_SRC_ATOP zipped=1
-.macro blend_kernel_SRC_ATOP
-        vmvn        q11, q11
-
-        vmull.u8    q12, d23, d1
-        vmull.u8    q0,  d22, d0
-        vmull.u8    q13, d23, d3
-        vmull.u8    q1,  d22, d2
-        vmull.u8    q14, d23, d5
-        vmull.u8    q2,  d22, d4
-
-        vmull.u8    q4,  d7, d17
-        vmull.u8    q8,  d6, d16
-        vmull.u8    q5,  d7, d19
-        vmull.u8    q9,  d6, d18
-        vmull.u8    q6,  d7, d21
-        vmull.u8    q10, d6, d20
-
-        vqadd.u16   q12, q4
-        vqadd.u16   q0,  q8
-        vqadd.u16   q13, q5
-        vqadd.u16   q1,  q9
-        vqadd.u16   q14, q6
-        vqadd.u16   q2,  q10
-
-        vrshr.u16   q8,  q0,  #8
-        vrshr.u16   q4,  q12, #8
-        vrshr.u16   q9,  q1,  #8
-        vrshr.u16   q5,  q13, #8
-        vrshr.u16   q10, q2,  #8
-        vrshr.u16   q6,  q14, #8
-
-        vqadd.u16   q0,  q8
-        vqadd.u16   q12, q4
-        vqadd.u16   q1,  q9
-        vqadd.u16   q13, q5
-        vqadd.u16   q2,  q10
-        vqadd.u16   q14, q6
-
-        vqrshrn.u16 d0, q0,  #8
-        vqrshrn.u16 d1, q12, #8
-        vqrshrn.u16 d2, q1,  #8
-        vqrshrn.u16 d3, q13, #8
-        vqrshrn.u16 d4, q2,  #8
-        vqrshrn.u16 d5, q14, #8
-.endm
-
-#define params_DST_ATOP zipped=1
-.macro blend_kernel_DST_ATOP
-        vmvn        q3, q3
-
-        vmull.u8    q12, d23, d1
-        vmull.u8    q0,  d22, d0
-        vmull.u8    q13, d23, d3
-        vmull.u8    q1,  d22, d2
-        vmull.u8    q14, d23, d5
-        vmull.u8    q2,  d22, d4
-
-        vmull.u8    q4,  d7, d17
-        vmull.u8    q8,  d6, d16
-        vmull.u8    q5,  d7, d19
-        vmull.u8    q9,  d6, d18
-        vmull.u8    q6,  d7, d21
-        vmull.u8    q10, d6, d20
-
-        vqadd.u16   q12, q4
-        vqadd.u16   q0,  q8
-        vqadd.u16   q13, q5
-        vqadd.u16   q1,  q9
-        vqadd.u16   q14, q6
-        vqadd.u16   q2,  q10
-
-        vrshr.u16   q8,  q0,  #8
-        vrshr.u16   q4,  q12, #8
-        vrshr.u16   q9,  q1,  #8
-        vrshr.u16   q5,  q13, #8
-        vrshr.u16   q10, q2,  #8
-        vrshr.u16   q6,  q14, #8
-
-        vqadd.u16   q0,  q8
-        vqadd.u16   q12, q4
-        vqadd.u16   q1,  q9
-        vqadd.u16   q13, q5
-        vqadd.u16   q2,  q10
-        vqadd.u16   q14, q6
-
-        vqrshrn.u16 d0, q0,  #8
-        vqrshrn.u16 d1, q12, #8
-        vqrshrn.u16 d2, q1,  #8
-        vqrshrn.u16 d3, q13, #8
-        vqrshrn.u16 d4, q2,  #8
-        vqrshrn.u16 d5, q14, #8
-
-        vmov        q3, q11
-.endm
-
-#define params_MULTIPLY zipped=0
-.macro blend_kernel_MULTIPLY
-        vmull.u8    q12, d1, d17
-        vmull.u8    q0,  d0, d16
-        vmull.u8    q13, d3, d19
-        vmull.u8    q1,  d2, d18
-        vmull.u8    q14, d5, d21
-        vmull.u8    q2,  d4, d20
-        vmull.u8    q15, d7, d23
-        vmull.u8    q3,  d6, d22
-
-        vrshrn.u16  d8,  q0,  #8
-        vrshrn.u16  d9,  q12, #8
-        vrshrn.u16  d10, q1,  #8
-        vrshrn.u16  d11, q13, #8
-        vrshrn.u16  d12, q2,  #8
-        vrshrn.u16  d13, q14, #8
-        vrshrn.u16  d14, q3,  #8
-        vrshrn.u16  d15, q15, #8
-
-        vaddw.u8    q0,  d8
-        vaddw.u8    q12, d9
-        vaddw.u8    q1,  d10
-        vaddw.u8    q13, d11
-        vaddw.u8    q2,  d12
-        vaddw.u8    q14, d13
-        vaddw.u8    q3,  d14
-        vaddw.u8    q15, d15
-
-        vrshrn.u16  d0, q0,  #8
-        vrshrn.u16  d1, q12, #8
-        vrshrn.u16  d2, q1,  #8
-        vrshrn.u16  d3, q13, #8
-        vrshrn.u16  d4, q2,  #8
-        vrshrn.u16  d5, q14, #8
-        vrshrn.u16  d6, q3,  #8
-        vrshrn.u16  d7, q15, #8
-.endm
-
-#define params_ADD zipped=0
-.macro blend_kernel_ADD
-        vqadd.u8 q0, q0, q8
-        vqadd.u8 q1, q1, q9
-        vqadd.u8 q2, q2, q10
-        vqadd.u8 q3, q3, q11
-.endm
-
-#define params_SUBTRACT zipped=0
-.macro blend_kernel_SUBTRACT
-        vqsub.u8 q0, q0, q8
-        vqsub.u8 q1, q1, q9
-        vqsub.u8 q2, q2, q10
-        vqsub.u8 q3, q3, q11
-.endm
-
-#define params_DIFFERENCE zipped=0
-.macro blend_kernel_DIFFERENCE
-        vabd.u8 q0, q0, q8
-        vabd.u8 q1, q1, q9
-        vabd.u8 q2, q2, q10
-        vabd.u8 q3, q3, q11
-.endm
-
-#define params_XOR zipped=0
-.macro blend_kernel_XOR
-        veor    q0, q0, q8
-        veor    q1, q1, q9
-        veor    q2, q2, q10
-        veor    q3, q3, q11
-.endm
-
-
-/* Define the wrapper code which will load and store the data, iterate the
- * correct number of times, and safely handle the remainder at the end of the
- * loop.  Various sections of assembly code are dropped or substituted for
- * simpler operations if they're not needed.
- */
-.macro wrap_line kernel, nowrap=0, zipped=1, lddst=1, ldsrc=1, pld=1
-.if \nowrap
-        \kernel
-.else
-        vpush   {d8-d15}
-        subs    r2, #64
-        b       2f
-        .align 4
-1:
-  .if \lddst
-    .if \zipped
-        vld4.8  {d0,d2,d4,d6}, [r0]!
-        vld4.8  {d1,d3,d5,d7}, [r0]!
-    .else
-        vld1.8  {d0-d3}, [r0]!
-        vld1.8  {d4-d7}, [r0]!
-    .endif
-        sub     r0, #64
-  .endif
-  .if \ldsrc
-    .if \zipped
-        vld4.8  {d16,d18,d20,d22}, [r1]!
-        vld4.8  {d17,d19,d21,d23}, [r1]!
-    .else
-        vld1.8  {d16-d19}, [r1]!
-        vld1.8  {d20-d23}, [r1]!
-    .endif
-  .endif
-  .if \pld
-    .if \lddst ; pld [r0, #192] ; .endif
-    .if \ldsrc ; pld [r1, #192] ; .endif
-  .endif
-
-        \kernel
-
-        subs    r2, #64
-  .if \zipped
-        vst4.8  {d0,d2,d4,d6}, [r0]!
-        vst4.8  {d1,d3,d5,d7}, [r0]!
-  .else
-        vst1.8  {d0-d3}, [r0]!
-        vst1.8  {d4-d7}, [r0]!
-  .endif
-
-2:      bge     1b
-        adds    r2, #64
-        beq     2f
-
-        /* To handle the tail portion of the data (something less than 64
-         * bytes) load small power-of-two chunks into working registers.  It
-         * doesn't matter where they end up in the register; the same process
-         * will store them back out using the same positions and the operations
-         * don't require data to interact with its neighbours.
-         */
-        vmov.i8 q0, #0
-        vmov.i8 q1, #0
-        vmov.i8 q2, #0
-        vmov.i8 q3, #0
-
-        vmov.i8 q8, #0
-        vmov.i8 q9, #0
-        vmov.i8 q10, #0
-        vmov.i8 q11, #0
-
-        tst     r2, #32
-        beq     1f
-  .if \lddst ; vld1.64 {d4-d7}, [r0]!   ; .endif
-  .if \ldsrc ; vld1.64 {d20-d23}, [r1]! ; .endif
-1:      tst     r2, #16
-        beq     1f
-  .if \lddst ; vld1.64 {d2-d3}, [r0]!   ; .endif
-  .if \ldsrc ; vld1.64 {d18-d19}, [r1]! ; .endif
-1:      tst     r2, #8
-        beq     1f
-  .if \lddst ; vld1.64 {d1}, [r0]!      ; .endif
-  .if \ldsrc ; vld1.64 {d17}, [r1]!     ; .endif
-1:      tst     r2, #4
-        beq     1f
-  .if \lddst ; vld1.32 {d0[1]}, [r0]!   ; .endif
-  .if \ldsrc ; vld1.32 {d16[1]}, [r1]!  ; .endif
-1:      tst     r2, #2
-        beq     1f
-  .if \lddst ; vld1.16 {d0[1]}, [r0]!   ; .endif
-  .if \ldsrc ; vld1.16 {d16[1]}, [r1]!  ; .endif
-1:      tst     r2, #1
-        beq     1f
-  .if \lddst ; vld1.8  {d0[1]}, [r0]!   ; .endif
-  .if \ldsrc ; vld1.8  {d16[1]}, [r1]!  ; .endif
-1:
-  .if \lddst ; sub     r0, r2           ; .endif
-
-  .if \zipped
-        /* One small impediment in the process above is that some of the load
-         * operations can't perform byte-wise structure deinterleaving at the
-         * same time as loading only part of a register.  So the data is loaded
-         * linearly and unpacked manually at this point.
-         */
-        vuzp.8  q0, q1
-        vuzp.8  q2, q3
-        vuzp.8  q0, q2
-        vuzp.8  q1, q3
-
-        vuzp.8  q8, q9
-        vuzp.8  q10, q11
-        vuzp.8  q8, q10
-        vuzp.8  q9, q11
-
-        \kernel
-
-        vzip.8  q0, q2
-        vzip.8  q1, q3
-        vzip.8  q0, q1
-        vzip.8  q2, q3
-  .else
-        \kernel
-  .endif
-
-        tst     r2, #32
-        beq     1f
-        vst1.64 {d4-d7}, [r0]!
-1:      tst     r2, #16
-        beq     1f
-        vst1.64 {d2-d3}, [r0]!
-1:      tst     r2, #8
-        beq     1f
-        vst1.64 {d1}, [r0]!
-1:      tst     r2, #4
-        beq     1f
-        vst1.32 {d0[1]}, [r0]!
-1:      tst     r2, #2
-        beq     1f
-        vst1.16 {d0[1]}, [r0]!
-1:      tst     r2, #1
-        beq     2f
-        vst1.8  {d0[1]}, [r0]!
-2:      vpop    {d8-d15}
-.endif
-        mov     r0, #0
-        bx      lr
-.endm
-
-
-/* produce list of blend_line_XX() functions; each function uses the wrap_line
- * macro, passing it the name of the operation macro it wants along with
- * optional parameters to remove unnecessary operations.
- */
-#define BLEND_X(d, n) ENTRY(blend_line_##n) ; wrap_line blend_kernel_##n, params_##n ; END(blend_line_##n) ;
-    BLEND_LIST(BLEND_X)
-#undef BLEND_X
-
-
-/*  int rsdIntrinsicBlend_K(
- *          uchar4 *out,        // r0
- *          uchar4 const *in,   // r1
- *          int slot,           // r2
- *          size_t xstart,      // r3
- *          size_t xend);       // [sp]
- */
-ENTRY(rsdIntrinsicBlend_K)
-    adr     ip, blend_functions
-    cmp     r2, #(blend_functions_end - blend_functions) >> 2
-    ldrlo   ip, [ip, r2, LSL #2]
-    movhs   ip, #0
-    ldr     r2, [sp]
-    add     r0, r3, LSL #2
-    add     r1, r3, LSL #2
-    sub     r2, r3
-    mov     r2, r2, LSL #2
-    cmp     ip, #0
-    addne   ip, ip, pc
-    bxne    ip
-1:  mov     r0, #-1
-    bx      lr
-
-blend_functions:
-.set off,0
-#define BLEND_X(d, n) .rept d-off ; .word 0 ; .endr ; .word blend_line_##n-1b ; .set off, d+1 ;
-        BLEND_LIST(BLEND_X)
-#undef BLEND_X
-blend_functions_end:
-
-END(rsdIntrinsicBlend_K)

diff --git a/toolkit/Blur.cpp b/toolkit/Blur.cpp
deleted file mode 100644
index a95ff43..0000000
--- a/toolkit/Blur.cpp
+++ /dev/null

@@ -1,545 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <math.h>
-
-#include <cstdint>
-
-#include "RenderScriptToolkit.h"
-#include "TaskProcessor.h"
-#include "Utils.h"
-
-namespace android {
-namespace renderscript {
-
-#define LOG_TAG "renderscript.toolkit.Blur"
-
-/**
- * Blurs an image or a section of an image.
- *
- * Our algorithm does two passes: a vertical blur followed by an horizontal blur.
- */
-class BlurTask : public Task {
-    // The image we're blurring.
-    const uchar* mIn;
-    // Where we store the blurred image.
-    uchar* outArray;
-    // The size of the kernel radius is limited to 25 in ScriptIntrinsicBlur.java.
-    // So, the max kernel size is 51 (= 2 * 25 + 1).
-    // Considering SSSE3 case, which requires the size is multiple of 4,
-    // at least 52 words are necessary. Values outside of the kernel should be 0.
-    float mFp[104];
-    uint16_t mIp[104];
-
-    // Working area to store the result of the vertical blur, to be used by the horizontal pass.
-    // There's one area per thread. Since the needed working area may be too large to put on the
-    // stack, we are allocating it from the heap. To avoid paying the allocation cost for each
-    // tile, we cache the scratch area here.
-    std::vector<void*> mScratch;       // Pointers to the scratch areas, one per thread.
-    std::vector<size_t> mScratchSize;  // The size in bytes of the scratch areas, one per thread.
-
-    // The radius of the blur, in floating point and integer format.
-    float mRadius;
-    int mIradius;
-
-    void kernelU4(void* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY,
-                  uint32_t threadIndex);
-    void kernelU1(void* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
-    void ComputeGaussianWeights();
-
-    // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
-    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
-                             size_t endY) override;
-
-   public:
-    BlurTask(const uint8_t* in, uint8_t* out, size_t sizeX, size_t sizeY, size_t vectorSize,
-             uint32_t threadCount, float radius, const Restriction* restriction)
-        : Task{sizeX, sizeY, vectorSize, false, restriction},
-          mIn{in},
-          outArray{out},
-          mScratch{threadCount},
-          mScratchSize{threadCount},
-          mRadius{std::min(25.0f, radius)} {
-        ComputeGaussianWeights();
-    }
-
-    ~BlurTask() {
-        for (size_t i = 0; i < mScratch.size(); i++) {
-            if (mScratch[i]) {
-                free(mScratch[i]);
-            }
-        }
-    }
-};
-
-void BlurTask::ComputeGaussianWeights() {
-    memset(mFp, 0, sizeof(mFp));
-    memset(mIp, 0, sizeof(mIp));
-
-    // Compute gaussian weights for the blur
-    // e is the euler's number
-    float e = 2.718281828459045f;
-    float pi = 3.1415926535897932f;
-    // g(x) = (1 / (sqrt(2 * pi) * sigma)) * e ^ (-x^2 / (2 * sigma^2))
-    // x is of the form [-radius .. 0 .. radius]
-    // and sigma varies with the radius.
-    // Based on some experimental radius values and sigmas,
-    // we approximately fit sigma = f(radius) as
-    // sigma = radius * 0.4  + 0.6
-    // The larger the radius gets, the more our gaussian blur
-    // will resemble a box blur since with large sigma
-    // the gaussian curve begins to lose its shape
-    float sigma = 0.4f * mRadius + 0.6f;
-
-    // Now compute the coefficients. We will store some redundant values to save
-    // some math during the blur calculations precompute some values
-    float coeff1 = 1.0f / (sqrtf(2.0f * pi) * sigma);
-    float coeff2 = - 1.0f / (2.0f * sigma * sigma);
-
-    float normalizeFactor = 0.0f;
-    float floatR = 0.0f;
-    int r;
-    mIradius = (float)ceil(mRadius) + 0.5f;
-    for (r = -mIradius; r <= mIradius; r ++) {
-        floatR = (float)r;
-        mFp[r + mIradius] = coeff1 * powf(e, floatR * floatR * coeff2);
-        normalizeFactor += mFp[r + mIradius];
-    }
-
-    // Now we need to normalize the weights because all our coefficients need to add up to one
-    normalizeFactor = 1.0f / normalizeFactor;
-    for (r = -mIradius; r <= mIradius; r ++) {
-        mFp[r + mIradius] *= normalizeFactor;
-        mIp[r + mIradius] = (uint16_t)(mFp[r + mIradius] * 65536.0f + 0.5f);
-    }
-}
-
-/**
- * Vertical blur of a uchar4 line.
- *
- * @param sizeY Number of cells of the input array in the vertical direction.
- * @param out Where to place the computed value.
- * @param x Coordinate of the point we're blurring.
- * @param y Coordinate of the point we're blurring.
- * @param ptrIn Start of the input array.
- * @param iStride The size in byte of a row of the input array.
- * @param gPtr The gaussian coefficients.
- * @param iradius The radius of the blur.
- */
-static void OneVU4(uint32_t sizeY, float4* out, int32_t x, int32_t y, const uchar* ptrIn,
-                   int iStride, const float* gPtr, int iradius) {
-    const uchar *pi = ptrIn + x*4;
-
-    float4 blurredPixel = 0;
-    for (int r = -iradius; r <= iradius; r ++) {
-        int validY = std::max((y + r), 0);
-        validY = std::min(validY, (int)(sizeY - 1));
-        const uchar4 *pvy = (const uchar4 *)&pi[validY * iStride];
-        float4 pf = convert<float4>(pvy[0]);
-        blurredPixel += pf * gPtr[0];
-        gPtr++;
-    }
-
-    out[0] = blurredPixel;
-}
-
-/**
- * Vertical blur of a uchar1 line.
- *
- * @param sizeY Number of cells of the input array in the vertical direction.
- * @param out Where to place the computed value.
- * @param x Coordinate of the point we're blurring.
- * @param y Coordinate of the point we're blurring.
- * @param ptrIn Start of the input array.
- * @param iStride The size in byte of a row of the input array.
- * @param gPtr The gaussian coefficients.
- * @param iradius The radius of the blur.
- */
-static void OneVU1(uint32_t sizeY, float *out, int32_t x, int32_t y,
-                   const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
-
-    const uchar *pi = ptrIn + x;
-
-    float blurredPixel = 0;
-    for (int r = -iradius; r <= iradius; r ++) {
-        int validY = std::max((y + r), 0);
-        validY = std::min(validY, (int)(sizeY - 1));
-        float pf = (float)pi[validY * iStride];
-        blurredPixel += pf * gPtr[0];
-        gPtr++;
-    }
-
-    out[0] = blurredPixel;
-}
-
-
-extern "C" void rsdIntrinsicBlurU1_K(uchar *out, uchar const *in, size_t w, size_t h,
-                 size_t p, size_t x, size_t y, size_t count, size_t r, uint16_t const *tab);
-extern "C" void rsdIntrinsicBlurU4_K(uchar4 *out, uchar4 const *in, size_t w, size_t h,
-                 size_t p, size_t x, size_t y, size_t count, size_t r, uint16_t const *tab);
-
-#if defined(ARCH_X86_HAVE_SSSE3)
-extern void rsdIntrinsicBlurVFU4_K(void *dst, const void *pin, int stride, const void *gptr,
-                                   int rct, int x1, int ct);
-extern void rsdIntrinsicBlurHFU4_K(void *dst, const void *pin, const void *gptr, int rct, int x1,
-                                   int ct);
-extern void rsdIntrinsicBlurHFU1_K(void *dst, const void *pin, const void *gptr, int rct, int x1,
-                                   int ct);
-#endif
-
-/**
- * Vertical blur of a line of RGBA, knowing that there's enough rows above and below us to avoid
- * dealing with boundary conditions.
- *
- * @param out Where to store the results. This is the input to the horizontal blur.
- * @param ptrIn The input data for this line.
- * @param iStride The width of the input.
- * @param gPtr The gaussian coefficients.
- * @param ct The diameter of the blur.
- * @param len How many cells to blur.
- * @param usesSimd Whether this processor supports SIMD.
- */
-static void OneVFU4(float4 *out, const uchar *ptrIn, int iStride, const float* gPtr, int ct,
-                    int x2, bool usesSimd) {
-    int x1 = 0;
-#if defined(ARCH_X86_HAVE_SSSE3)
-    if (usesSimd) {
-        int t = (x2 - x1);
-        t &= ~1;
-        if (t) {
-            rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, x1, x1 + t);
-        }
-        x1 += t;
-        out += t;
-        ptrIn += t << 2;
-    }
-#else
-    (void) usesSimd; // Avoid unused parameter warning.
-#endif
-    while(x2 > x1) {
-        const uchar *pi = ptrIn;
-        float4 blurredPixel = 0;
-        const float* gp = gPtr;
-
-        for (int r = 0; r < ct; r++) {
-            float4 pf = convert<float4>(((const uchar4 *)pi)[0]);
-            blurredPixel += pf * gp[0];
-            pi += iStride;
-            gp++;
-        }
-        out->xyzw = blurredPixel;
-        x1++;
-        out++;
-        ptrIn+=4;
-    }
-}
-
-/**
- * Vertical blur of a line of U_8, knowing that there's enough rows above and below us to avoid
- * dealing with boundary conditions.
- *
- * @param out Where to store the results. This is the input to the horizontal blur.
- * @param ptrIn The input data for this line.
- * @param iStride The width of the input.
- * @param gPtr The gaussian coefficients.
- * @param ct The diameter of the blur.
- * @param len How many cells to blur.
- * @param usesSimd Whether this processor supports SIMD.
- */
-static void OneVFU1(float* out, const uchar* ptrIn, int iStride, const float* gPtr, int ct, int len,
-                    bool usesSimd) {
-    int x1 = 0;
-
-    while((len > x1) && (((uintptr_t)ptrIn) & 0x3)) {
-        const uchar *pi = ptrIn;
-        float blurredPixel = 0;
-        const float* gp = gPtr;
-
-        for (int r = 0; r < ct; r++) {
-            float pf = (float)pi[0];
-            blurredPixel += pf * gp[0];
-            pi += iStride;
-            gp++;
-        }
-        out[0] = blurredPixel;
-        x1++;
-        out++;
-        ptrIn++;
-        len--;
-    }
-#if defined(ARCH_X86_HAVE_SSSE3)
-    if (usesSimd && (len > x1)) {
-        int t = (len - x1) >> 2;
-        t &= ~1;
-        if (t) {
-            rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, 0, t );
-            len -= t << 2;
-            ptrIn += t << 2;
-            out += t << 2;
-        }
-    }
-#else
-    (void) usesSimd; // Avoid unused parameter warning.
-#endif
-    while(len > 0) {
-        const uchar *pi = ptrIn;
-        float blurredPixel = 0;
-        const float* gp = gPtr;
-
-        for (int r = 0; r < ct; r++) {
-            float pf = (float)pi[0];
-            blurredPixel += pf * gp[0];
-            pi += iStride;
-            gp++;
-        }
-        out[0] = blurredPixel;
-        len--;
-        out++;
-        ptrIn++;
-    }
-}
-
-/**
- * Horizontal blur of a uchar4 line.
- *
- * @param sizeX Number of cells of the input array in the horizontal direction.
- * @param out Where to place the computed value.
- * @param x Coordinate of the point we're blurring.
- * @param ptrIn The start of the input row from which we're indexing x.
- * @param gPtr The gaussian coefficients.
- * @param iradius The radius of the blur.
- */
-static void OneHU4(uint32_t sizeX, uchar4* out, int32_t x, const float4* ptrIn, const float* gPtr,
-                   int iradius) {
-    float4 blurredPixel = 0;
-    for (int r = -iradius; r <= iradius; r ++) {
-        int validX = std::max((x + r), 0);
-        validX = std::min(validX, (int)(sizeX - 1));
-        float4 pf = ptrIn[validX];
-        blurredPixel += pf * gPtr[0];
-        gPtr++;
-    }
-
-    out->xyzw = convert<uchar4>(blurredPixel);
-}
-
-/**
- * Horizontal blur of a uchar line.
- *
- * @param sizeX Number of cells of the input array in the horizontal direction.
- * @param out Where to place the computed value.
- * @param x Coordinate of the point we're blurring.
- * @param ptrIn The start of the input row from which we're indexing x.
- * @param gPtr The gaussian coefficients.
- * @param iradius The radius of the blur.
- */
-static void OneHU1(uint32_t sizeX, uchar* out, int32_t x, const float* ptrIn, const float* gPtr,
-                   int iradius) {
-    float blurredPixel = 0;
-    for (int r = -iradius; r <= iradius; r ++) {
-        int validX = std::max((x + r), 0);
-        validX = std::min(validX, (int)(sizeX - 1));
-        float pf = ptrIn[validX];
-        blurredPixel += pf * gPtr[0];
-        gPtr++;
-    }
-
-    out[0] = (uchar)blurredPixel;
-}
-
-/**
- * Full blur of a line of RGBA data.
- *
- * @param outPtr Where to store the results
- * @param xstart The index of the section we're starting to blur.
- * @param xend  The end index of the section.
- * @param currentY The index of the line we're blurring.
- * @param usesSimd Whether this processor supports SIMD.
- */
-void BlurTask::kernelU4(void *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY,
-                        uint32_t threadIndex) {
-    float4 stackbuf[2048];
-    float4 *buf = &stackbuf[0];
-    const uint32_t stride = mSizeX * mVectorSize;
-
-    uchar4 *out = (uchar4 *)outPtr;
-    uint32_t x1 = xstart;
-    uint32_t x2 = xend;
-
-#if defined(ARCH_ARM_USE_INTRINSICS)
-    if (mUsesSimd && mSizeX >= 4) {
-      rsdIntrinsicBlurU4_K(out, (uchar4 const *)(mIn + stride * currentY),
-                 mSizeX, mSizeY,
-                 stride, x1, currentY, x2 - x1, mIradius, mIp + mIradius);
-        return;
-    }
-#endif
-
-    if (mSizeX > 2048) {
-        if ((mSizeX > mScratchSize[threadIndex]) || !mScratch[threadIndex]) {
-            // Pad the side of the allocation by one unit to allow alignment later
-            mScratch[threadIndex] = realloc(mScratch[threadIndex], (mSizeX + 1) * 16);
-            mScratchSize[threadIndex] = mSizeX;
-        }
-        // realloc only aligns to 8 bytes so we manually align to 16.
-        buf = (float4 *) ((((intptr_t)mScratch[threadIndex]) + 15) & ~0xf);
-    }
-    float4 *fout = (float4 *)buf;
-    int y = currentY;
-    if ((y > mIradius) && (y < ((int)mSizeY - mIradius))) {
-        const uchar *pi = mIn + (y - mIradius) * stride;
-        OneVFU4(fout, pi, stride, mFp, mIradius * 2 + 1, mSizeX, mUsesSimd);
-    } else {
-        x1 = 0;
-        while(mSizeX > x1) {
-            OneVU4(mSizeY, fout, x1, y, mIn, stride, mFp, mIradius);
-            fout++;
-            x1++;
-        }
-    }
-
-    x1 = xstart;
-    while ((x1 < (uint32_t)mIradius) && (x1 < x2)) {
-        OneHU4(mSizeX, out, x1, buf, mFp, mIradius);
-        out++;
-        x1++;
-    }
-#if defined(ARCH_X86_HAVE_SSSE3)
-    if (mUsesSimd) {
-        if ((x1 + mIradius) < x2) {
-            rsdIntrinsicBlurHFU4_K(out, buf - mIradius, mFp,
-                                   mIradius * 2 + 1, x1, x2 - mIradius);
-            out += (x2 - mIradius) - x1;
-            x1 = x2 - mIradius;
-        }
-    }
-#endif
-    while(x2 > x1) {
-        OneHU4(mSizeX, out, x1, buf, mFp, mIradius);
-        out++;
-        x1++;
-    }
-}
-
-/**
- * Full blur of a line of U_8 data.
- *
- * @param outPtr Where to store the results
- * @param xstart The index of the section we're starting to blur.
- * @param xend  The end index of the section.
- * @param currentY The index of the line we're blurring.
- */
-void BlurTask::kernelU1(void *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
-    float buf[4 * 2048];
-    const uint32_t stride = mSizeX * mVectorSize;
-
-    uchar *out = (uchar *)outPtr;
-    uint32_t x1 = xstart;
-    uint32_t x2 = xend;
-
-#if defined(ARCH_ARM_USE_INTRINSICS)
-    if (mUsesSimd && mSizeX >= 16) {
-        // The specialisation for r<=8 has an awkward prefill case, which is
-        // fiddly to resolve, where starting close to the right edge can cause
-        // a read beyond the end of input.  So avoid that case here.
-        if (mIradius > 8 || (mSizeX - std::max(0, (int32_t)x1 - 8)) >= 16) {
-            rsdIntrinsicBlurU1_K(out, mIn + stride * currentY, mSizeX, mSizeY,
-                     stride, x1, currentY, x2 - x1, mIradius, mIp + mIradius);
-            return;
-        }
-    }
-#endif
-
-    float *fout = (float *)buf;
-    int y = currentY;
-    if ((y > mIradius) && (y < ((int)mSizeY - mIradius -1))) {
-        const uchar *pi = mIn + (y - mIradius) * stride;
-        OneVFU1(fout, pi, stride, mFp, mIradius * 2 + 1, mSizeX, mUsesSimd);
-    } else {
-        x1 = 0;
-        while(mSizeX > x1) {
-            OneVU1(mSizeY, fout, x1, y, mIn, stride, mFp, mIradius);
-            fout++;
-            x1++;
-        }
-    }
-
-    x1 = xstart;
-    while ((x1 < x2) &&
-           ((x1 < (uint32_t)mIradius) || (((uintptr_t)out) & 0x3))) {
-        OneHU1(mSizeX, out, x1, buf, mFp, mIradius);
-        out++;
-        x1++;
-    }
-#if defined(ARCH_X86_HAVE_SSSE3)
-    if (mUsesSimd) {
-        if ((x1 + mIradius) < x2) {
-            uint32_t len = x2 - (x1 + mIradius);
-            len &= ~3;
-
-            // rsdIntrinsicBlurHFU1_K() processes each four float values in |buf| at once, so it
-            // nees to ensure four more values can be accessed in order to avoid accessing
-            // uninitialized buffer.
-            if (len > 4) {
-                len -= 4;
-                rsdIntrinsicBlurHFU1_K(out, ((float *)buf) - mIradius, mFp,
-                                       mIradius * 2 + 1, x1, x1 + len);
-                out += len;
-                x1 += len;
-            }
-        }
-    }
-#endif
-    while(x2 > x1) {
-        OneHU1(mSizeX, out, x1, buf, mFp, mIradius);
-        out++;
-        x1++;
-    }
-}
-
-void BlurTask::processData(int threadIndex, size_t startX, size_t startY, size_t endX,
-                           size_t endY) {
-    for (size_t y = startY; y < endY; y++) {
-        void* outPtr = outArray + (mSizeX * y + startX) * mVectorSize;
-        if (mVectorSize == 4) {
-            kernelU4(outPtr, startX, endX, y, threadIndex);
-        } else {
-            kernelU1(outPtr, startX, endX, y);
-        }
-    }
-}
-
-void RenderScriptToolkit::blur(const uint8_t* in, uint8_t* out, size_t sizeX, size_t sizeY,
-                               size_t vectorSize, int radius, const Restriction* restriction) {
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
-    if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
-        return;
-    }
-    if (radius <= 0 || radius > 25) {
-        ALOGE("The radius should be between 1 and 25. %d provided.", radius);
-    }
-    if (vectorSize != 1 && vectorSize != 4) {
-        ALOGE("The vectorSize should be 1 or 4. %zu provided.", vectorSize);
-    }
-#endif
-
-    BlurTask task(in, out, sizeX, sizeY, vectorSize, processor->getNumberOfThreads(), radius,
-                  restriction);
-    processor->doTask(&task);
-}
-
-}  // namespace renderscript
-}  // namespace android

diff --git a/toolkit/Blur_advsimd.S b/toolkit/Blur_advsimd.S
deleted file mode 100644
index 6d3cb8d..0000000
--- a/toolkit/Blur_advsimd.S
+++ /dev/null

@@ -1,1868 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
-#define PRIVATE(f) .text; .align 4; .type f,#function; f:
-#define END(f) .size f, .-f;
-
-//#define ARCH_ARM64_USE_BLUR_PRELOAD
-
-/* Number of fractional bits to preserve in intermediate results.  The
- * intermediate storage is 16-bit, and we started with 8 bit data (the integer
- * part), so this should be between 0 and 8.
- */
-.set FRACTION_BITS, 7
-.set MAX_R, 25
-
-
-/* A quick way of making a line of code conditional on some other condition.
- * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
- * `ifcc`:
- */
-.macro ifcc zzz:vararg
-.if cc
-            \zzz
-.endif
-.endm
-
-/* It's not always clear that prefetching is beneficial and this needs further
- * testing on different cores, so it's made switchable here.
- */
-#if defined(ARCH_ARM64_USE_BLUR_PRELOAD)
-#define VERTPLD(...) prfm        PLDL1KEEP, [__VA_ARGS__]
-#else
-#define VERTPLD(...) nop
-#endif
-
-/* Fetch 16 columns of bytes (regardless of image format), convolve these
- * vertically, and leave them in the register file.  If working near the top or
- * bottom of an image then clamp the addressing while loading the data in.
- *
- * The convolution is fully unrolled for windows up to max_r, with the
- * outermost edges calculated first.  This way it's possible to branch directly
- * into the relevant part of the code for an arbitrary convolution radius.  Two
- * variants of the loop are produced; one eliminates the clamping code for a
- * slight speed advantage.
- *
- * Where the macro is called with reg=x, the specified register is taken to
- * contain a pre-calculated pointer into one of the two loops.
- *
- * Input:
- *      x1 -- src
- *      x2 -- pitch
- *      x5 -- r
- *      x6 -- rup (r, unless clipped to top of source image)
- *      x7 -- rdn (r, unless clipped to bottom of source image)
- *      x12 -- switch index
- *      v0-v3 -- coefficient table
- *      x13 = -pitch
- *      x15 = top-row in
- *      x19 = bottom-row in
- * Output:
- *      x1 += 16
- *      v10,v11 -- 16 convolved columns
- * Modifies:
- *      x10 = upper row pointer
- *      x11 = lower row pointer
- *      v12-v15 = temporary sums
- */
-.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
-  .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif
-
-            ld1         {v15.16b}, [x1], #16
-            mov         x10, x15
-
-            uxtl        v14.8h, v15.8b
-            VERTPLD(x1, #16)
-            uxtl2       v15.8h, v15.16b
-  .if \max_r < 16 // approximate
-    ifcc    adr         \reg, 1f
-  .else
-    ifcc    adrp        \reg, 1f
-    ifcc    add         \reg, \reg, #:lo12:1f
-  .endif
-
-            umull       v12.4s, v14.4h, v0.h[0]
-    ifcc    sub         \reg, \reg, x5, LSL #6
-            umull2      v13.4s, v14.8h, v0.h[0]
-            mov         x11, x19
-            umull       v14.4s, v15.4h, v0.h[0]
-    ifcc    add         \reg, \reg, x5, LSL #3
-            umull2      v15.4s, v15.8h, v0.h[0]
-            br          \reg
-
-  /* This version of the vertical fetch loop body is used away from the edges
-   * of the source image.  The pointers start at the top and bottom source rows
-   * and work their way towards the centre on each iteration.  This way the
-   * number of taps used can be controlled by jumping directly into the middle
-   * of the loop and running to completion.
-   * If the loop body changes size then the code which calculates the address of
-   * the initial iteration must be updated to accordingly.
-   */
-  .macro vertfetch_noclamp i, dreg
-    .if 0 < \i && \i <= \max_r
-            ld1         {v10.16b}, [x10], x2
-            ld1         {v11.16b}, [x11], x13
-            uaddl       v16.8h, v10.8b, v11.8b
-            uaddl2      v11.8h, v10.16b, v11.16b
-            umlal       v12.4s, v16.4h, \dreg
-            umlal2      v13.4s, v16.8h, \dreg
-            VERTPLD(x10, #32)
-            umlal       v14.4s, v11.4h, \dreg
-            VERTPLD(x11, #32)
-            umlal2      v15.4s, v11.8h, \dreg
-    .endif
-  .endm
-
-  /* This version of the vertical fetch loop body is used near the edges of the
-   * source image, where one or both of the accesses may start with a clamped
-   * value, and the row addresses only begin to change after some number of
-   * iterations before the end.
-   * If the loop body changes size then the code which calculates the address of
-   * the initial iteration must be updated to accordingly.
-   */
-  .macro vertfetch_clamped i, dreg
-    .if 0 < \i && \i <= \max_r
-            ld1         {v10.16b}, [x10], x2
-            cmp         x6, #\i
-            ld1         {v11.16b}, [x11], x13
-            csel        x10, x15, x10, lo
-            uaddl       v16.8h, v10.8b, v11.8b
-            cmp         x7, #\i
-            uaddl2      v11.8h, v10.16b, v11.16b
-            csel        x11, x19, x11, lo
-            umlal       v12.4s, v16.4h, \dreg
-            umlal2      v13.4s, v16.8h, \dreg
-            VERTPLD(x10, #32)
-            umlal       v14.4s, v11.4h, \dreg
-            VERTPLD(x11, #32)
-            umlal2      v15.4s, v11.8h, \dreg
-    .endif
-  .endm
-
-  /* Entry into this unrolled loop is computed as a negative index from
-   * \labelc at the end of the block.
-   */
-  .align 4
-  vertfetch_clamped 27, v3.h[3]
-  vertfetch_clamped 26, v3.h[2]
-  vertfetch_clamped 25, v3.h[1]
-  vertfetch_clamped 24, v3.h[0]
-  vertfetch_clamped 23, v2.h[7]
-  vertfetch_clamped 22, v2.h[6]
-  vertfetch_clamped 21, v2.h[5]
-  vertfetch_clamped 20, v2.h[4]
-  vertfetch_clamped 19, v2.h[3]
-  vertfetch_clamped 18, v2.h[2]
-  vertfetch_clamped 17, v2.h[1]
-  vertfetch_clamped 16, v2.h[0]
-  vertfetch_clamped 15, v1.h[7]
-  vertfetch_clamped 14, v1.h[6]
-  vertfetch_clamped 13, v1.h[5]
-  vertfetch_clamped 12, v1.h[4]
-  vertfetch_clamped 11, v1.h[3]
-  vertfetch_clamped 10, v1.h[2]
-  vertfetch_clamped  9, v1.h[1]
-  vertfetch_clamped  8, v1.h[0]
-  vertfetch_clamped  7, v0.h[7]
-  vertfetch_clamped  6, v0.h[6]
-  vertfetch_clamped  5, v0.h[5]
-  vertfetch_clamped  4, v0.h[4]
-  vertfetch_clamped  3, v0.h[3]
-  vertfetch_clamped  2, v0.h[2]
-  vertfetch_clamped  1, v0.h[1]
-  vertfetch_clamped  0, v0.h[0]
-  1:
-  \labelc : b 2f    /* done with clamped loop, skip over non-clamped loop */
-
-  /* Entry into this unrolled loop is computed as a negative index from
-   * \labelnc at the end of the block.
-   */
-  .align 4
-  vertfetch_noclamp 27, v3.h[3]
-  vertfetch_noclamp 26, v3.h[2]
-  vertfetch_noclamp 25, v3.h[1]
-  vertfetch_noclamp 24, v3.h[0]
-  vertfetch_noclamp 23, v2.h[7]
-  vertfetch_noclamp 22, v2.h[6]
-  vertfetch_noclamp 21, v2.h[5]
-  vertfetch_noclamp 20, v2.h[4]
-  vertfetch_noclamp 19, v2.h[3]
-  vertfetch_noclamp 18, v2.h[2]
-  vertfetch_noclamp 17, v2.h[1]
-  vertfetch_noclamp 16, v2.h[0]
-  vertfetch_noclamp 15, v1.h[7]
-  vertfetch_noclamp 14, v1.h[6]
-  vertfetch_noclamp 13, v1.h[5]
-  vertfetch_noclamp 12, v1.h[4]
-  vertfetch_noclamp 11, v1.h[3]
-  vertfetch_noclamp 10, v1.h[2]
-  vertfetch_noclamp  9, v1.h[1]
-  vertfetch_noclamp  8, v1.h[0]
-  vertfetch_noclamp  7, v0.h[7]
-  vertfetch_noclamp  6, v0.h[6]
-  vertfetch_noclamp  5, v0.h[5]
-  vertfetch_noclamp  4, v0.h[4]
-  vertfetch_noclamp  3, v0.h[3]
-  vertfetch_noclamp  2, v0.h[2]
-  vertfetch_noclamp  1, v0.h[1]
-  vertfetch_noclamp  0, v0.h[0]
-  \labelnc :
-
-  .purgem vertfetch_clamped
-  .purgem vertfetch_noclamp
-
-  2:        uqrshrn     v10.4h, v12.4s, #16 - FRACTION_BITS
-            add         x15, x15, #16
-            uqrshrn2    v10.8h, v13.4s, #16 - FRACTION_BITS
-            add         x19, x19, #16
-            uqrshrn     v11.4h, v14.4s, #16 - FRACTION_BITS
-            uqrshrn2    v11.8h, v15.4s, #16 - FRACTION_BITS
-.endm /*}}}*/
-
-/* Some portion of the convolution window (as much as will fit, and all of it
- * for the uchar1 cases) is kept in the register file to avoid unnecessary
- * memory accesses.  This forces the horizontal loops to be unrolled because
- * there's no indexed addressing into the register file.
- *
- * As in the fetch macro, the operations are ordered from outside to inside, so
- * that jumping into the middle of the block bypasses the unwanted window taps.
- *
- * There are several variants of the macro because of the fixed offets of the
- * taps -- the wider the maximum radius the further the centre tap is from the
- * most recently fetched data.  This means that pre-filling the window requires
- * more data that won't be used and it means that rotating the window involves
- * more mov operations.
- *
- * When the buffer gets too big the buffer at [x9] is used.
- *
- * Input:
- *      v16-v31,v4-v11 -- convoltion window
- *      x9 -- pointer to additional convolution window data
- * Output:
- *      x9 -- updated buffer pointer (if used)
- *      d31 -- result to be stored
- * Modifies:
- *      x12 -- temp buffer pointer
- *      v12-v13 -- temporaries for load and vext operations.
- *      v14-v15 -- intermediate sums
- */
-#define TUNED_LIST1 8, 16
-.macro hconv1_8/*{{{*/
-
-.rodata
-    200:    .hword -4
-            .hword 101f-100f
-            .hword 102f-100f
-            .hword 103f-100f
-            .hword 104f-100f
-            .hword 105f-100f
-            .hword 106f-100f
-            .hword 107f-100f
-            .hword 108f-100f
-            .align      4
-.text
-            umull       v14.4s, v9.4h, v0.h[0]
-            umull2      v15.4s, v9.8h, v0.h[0]
-
-            adrp        x16, 200b
-            add         x16, x16, :lo12:200b
-            ldrsh       x12, [x16, x5, LSL #1]
-            adr         x16, 100f
-            add         x12, x12, x16
-    100:    br          x12
-    108:    umlal       v14.4s, v8.4h, v1.h[0]
-            umlal2      v15.4s, v8.8h, v1.h[0]
-            umlal       v14.4s, v10.4h, v1.h[0]
-            umlal2      v15.4s, v10.8h, v1.h[0]
-    107:    ext         v12.16b, v8.16b, v9.16b, #1*2
-            ext         v13.16b, v9.16b, v10.16b, #7*2
-            umlal       v14.4s, v12.4h, v0.h[7]
-            umlal2      v15.4s, v12.8h, v0.h[7]
-            umlal       v14.4s, v13.4h, v0.h[7]
-            umlal2      v15.4s, v13.8h, v0.h[7]
-    106:    ext         v12.16b, v8.16b, v9.16b, #2*2
-            ext         v13.16b, v9.16b, v10.16b, #6*2
-            umlal       v14.4s, v12.4h, v0.h[6]
-            umlal2      v15.4s, v12.8h, v0.h[6]
-            umlal       v14.4s, v13.4h, v0.h[6]
-            umlal2      v15.4s, v13.8h, v0.h[6]
-    105:    ext         v12.16b, v8.16b, v9.16b, #3*2
-            ext         v13.16b, v9.16b, v10.16b, #5*2
-            umlal       v14.4s, v12.4h, v0.h[5]
-            umlal2      v15.4s, v12.8h, v0.h[5]
-            umlal       v14.4s, v13.4h, v0.h[5]
-            umlal2      v15.4s, v13.8h, v0.h[5]
-    104:    //ext         v12.16b, v8.16b, v9.16b, #4*2
-            //ext         v13.16b, v9.16b, v10.16b, #4*2
-            umlal2      v14.4s, v8.8h, v0.h[4]
-            umlal       v15.4s, v9.4h, v0.h[4]
-            umlal2      v14.4s, v9.8h, v0.h[4]
-            umlal       v15.4s, v10.4h, v0.h[4]
-    103:    ext         v12.16b, v8.16b, v9.16b, #5*2
-            ext         v13.16b, v9.16b, v10.16b, #3*2
-            umlal       v14.4s, v12.4h, v0.h[3]
-            umlal2      v15.4s, v12.8h, v0.h[3]
-            umlal       v14.4s, v13.4h, v0.h[3]
-            umlal2      v15.4s, v13.8h, v0.h[3]
-    102:    ext         v12.16b, v8.16b, v9.16b, #6*2
-            ext         v13.16b, v9.16b, v10.16b, #2*2
-            umlal       v14.4s, v12.4h, v0.h[2]
-            umlal2      v15.4s, v12.8h, v0.h[2]
-            umlal       v14.4s, v13.4h, v0.h[2]
-            umlal2      v15.4s, v13.8h, v0.h[2]
-    101:    ext         v12.16b, v8.16b, v9.16b, #7*2
-            ext         v13.16b, v9.16b, v10.16b, #1*2
-            umlal       v14.4s, v12.4h, v0.h[1]
-            umlal2      v15.4s, v12.8h, v0.h[1]
-            umlal       v14.4s, v13.4h, v0.h[1]
-            umlal2      v15.4s, v13.8h, v0.h[1]
-
-            uqrshrn     v14.4h, v14.4s, #16
-            uqrshrn2    v14.8h, v15.4s, #16
-            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
-
-            mov         v8.16b, v9.16b
-            mov         v9.16b, v10.16b
-            mov         v10.16b, v11.16b
-.endm/*}}}*/
-
-.macro hconv1_16/*{{{*/ // Weighted sum of the vector in v8 and its neighbours up to x5 halfwords away on each side; 8 result bytes end up in v15. Clobbers x12, x16, v12-v15.
-.rodata                 // The jump table is emitted into read-only data rather than the instruction stream.
-   200:     .hword -4   // Entry 0. NOTE(review): resolves to the instruction just before 100:, so x5 == 0 is presumably never used — confirm.
-            .hword 101f-100f // Entries 1..16: byte offset of label 1NN from the branch at 100:.
-            .hword 102f-100f
-            .hword 103f-100f
-            .hword 104f-100f
-            .hword 105f-100f
-            .hword 106f-100f
-            .hword 107f-100f
-            .hword 108f-100f
-            .hword 109f-100f
-            .hword 110f-100f
-            .hword 111f-100f
-            .hword 112f-100f
-            .hword 113f-100f
-            .hword 114f-100f
-            .hword 115f-100f
-            .hword 116f-100f
-            .align 4
-
-.text
-            umull       v14.4s, v8.4h, v0.h[0] // v14 / v15 hold the 32-bit sums for lanes 0-3 / 4-7, seeded with v8 * v0.h[0].
-            umull2      v15.4s, v8.8h, v0.h[0]
-
-            adrp        x16, 200b // x16 = address of table 200 (page, then low 12 bits).
-            add         x16, x16, :lo12:200b
-            ldrsh       x12, [x16, x5, LSL #1] // x12 = sign-extended 16-bit table entry number x5.
-            adr         x16, 100f // x16 = address of label 100, the base the table offsets are relative to.
-            add         x12, x12, x16
-    100:    br          x12 // Enter the ladder at label (100 + x5); execution falls through every lower label down to 101.
-    116:    //ext         v12.16b, v6.16b, v7.16b, #0*2
-            //ext         v13.16b, v10.16b, v11.16b, #0*2
-            umlal       v14.4s, v6.4h, v2.h[0] // Step 16: whole registers v6 and v10 sit 16 halfwords either side of v8, so no ext is needed.
-            umlal2      v15.4s, v6.8h, v2.h[0]
-            umlal       v14.4s, v10.4h, v2.h[0]
-            umlal2      v15.4s, v10.8h, v2.h[0]
-    115:    ext         v12.16b, v6.16b, v7.16b, #1*2 // Steps 15..13: v12 / v13 = the vectors k halfwords before / after v8, each scaled by coefficient lane k.
-            ext         v13.16b, v9.16b, v10.16b, #7*2
-            umlal       v14.4s, v12.4h, v1.h[7]
-            umlal2      v15.4s, v12.8h, v1.h[7]
-            umlal       v14.4s, v13.4h, v1.h[7]
-            umlal2      v15.4s, v13.8h, v1.h[7]
-    114:    ext         v12.16b, v6.16b, v7.16b, #2*2
-            ext         v13.16b, v9.16b, v10.16b, #6*2
-            umlal       v14.4s, v12.4h, v1.h[6]
-            umlal2      v15.4s, v12.8h, v1.h[6]
-            umlal       v14.4s, v13.4h, v1.h[6]
-            umlal2      v15.4s, v13.8h, v1.h[6]
-    113:    ext         v12.16b, v6.16b, v7.16b, #3*2
-            ext         v13.16b, v9.16b, v10.16b, #5*2
-            umlal       v14.4s, v12.4h, v1.h[5]
-            umlal2      v15.4s, v12.8h, v1.h[5]
-            umlal       v14.4s, v13.4h, v1.h[5]
-            umlal2      v15.4s, v13.8h, v1.h[5]
-    112:    //ext         v12.16b, v6.16b, v7.16b, #4*2
-            //ext         v13.16b, v9.16b, v10.16b, #4*2
-            umlal2      v14.4s, v6.8h, v1.h[4] // Step 12: a 4-halfword shift is exactly half a register, so the high / low halves are fed to umlal2 / umlal directly.
-            umlal       v15.4s, v7.4h, v1.h[4]
-            umlal2      v14.4s, v9.8h, v1.h[4]
-            umlal       v15.4s, v10.4h, v1.h[4]
-    111:    ext         v12.16b, v6.16b, v7.16b, #5*2
-            ext         v13.16b, v9.16b, v10.16b, #3*2
-            umlal       v14.4s, v12.4h, v1.h[3]
-            umlal2      v15.4s, v12.8h, v1.h[3]
-            umlal       v14.4s, v13.4h, v1.h[3]
-            umlal2      v15.4s, v13.8h, v1.h[3]
-    110:    ext         v12.16b, v6.16b, v7.16b, #6*2
-            ext         v13.16b, v9.16b, v10.16b, #2*2
-            umlal       v14.4s, v12.4h, v1.h[2]
-            umlal2      v15.4s, v12.8h, v1.h[2]
-            umlal       v14.4s, v13.4h, v1.h[2]
-            umlal2      v15.4s, v13.8h, v1.h[2]
-    109:    ext         v12.16b, v6.16b, v7.16b, #7*2
-            ext         v13.16b, v9.16b, v10.16b, #1*2
-            umlal       v14.4s, v12.4h, v1.h[1]
-            umlal2      v15.4s, v12.8h, v1.h[1]
-            umlal       v14.4s, v13.4h, v1.h[1]
-            umlal2      v15.4s, v13.8h, v1.h[1]
-    108:    //ext         v12.16b, v7.16b, v8.16b, #0*2
-            //ext         v13.16b, v9.16b, v10.16b, #0*2
-            umlal       v14.4s, v7.4h, v1.h[0] // Step 8: whole registers v7 and v9 are the vectors 8 halfwords either side of v8.
-            umlal2      v15.4s, v7.8h, v1.h[0]
-            umlal       v14.4s, v9.4h, v1.h[0]
-            umlal2      v15.4s, v9.8h, v1.h[0]
-    107:    ext         v12.16b, v7.16b, v8.16b, #1*2 // Steps 7..1 repeat the pattern using the v7:v8 and v8:v9 register pairs and coefficients from v0.
-            ext         v13.16b, v8.16b, v9.16b, #7*2
-            umlal       v14.4s, v12.4h, v0.h[7]
-            umlal2      v15.4s, v12.8h, v0.h[7]
-            umlal       v14.4s, v13.4h, v0.h[7]
-            umlal2      v15.4s, v13.8h, v0.h[7]
-    106:    ext         v12.16b, v7.16b, v8.16b, #2*2
-            ext         v13.16b, v8.16b, v9.16b, #6*2
-            umlal       v14.4s, v12.4h, v0.h[6]
-            umlal2      v15.4s, v12.8h, v0.h[6]
-            umlal       v14.4s, v13.4h, v0.h[6]
-            umlal2      v15.4s, v13.8h, v0.h[6]
-    105:    ext         v12.16b, v7.16b, v8.16b, #3*2
-            ext         v13.16b, v8.16b, v9.16b, #5*2
-            umlal       v14.4s, v12.4h, v0.h[5]
-            umlal2      v15.4s, v12.8h, v0.h[5]
-            umlal       v14.4s, v13.4h, v0.h[5]
-            umlal2      v15.4s, v13.8h, v0.h[5]
-    104:    //ext         v12.16b, v7.16b, v8.16b, #4*2
-            //ext         v13.16b, v8.16b, v9.16b, #4*2
-            umlal2      v14.4s, v7.8h, v0.h[4] // Step 4: half-register shift again, so register halves are used directly instead of ext.
-            umlal       v15.4s, v8.4h, v0.h[4]
-            umlal2      v14.4s, v8.8h, v0.h[4]
-            umlal       v15.4s, v9.4h, v0.h[4]
-    103:    ext         v12.16b, v7.16b, v8.16b, #5*2
-            ext         v13.16b, v8.16b, v9.16b, #3*2
-            umlal       v14.4s, v12.4h, v0.h[3]
-            umlal2      v15.4s, v12.8h, v0.h[3]
-            umlal       v14.4s, v13.4h, v0.h[3]
-            umlal2      v15.4s, v13.8h, v0.h[3]
-    102:    ext         v12.16b, v7.16b, v8.16b, #6*2
-            ext         v13.16b, v8.16b, v9.16b, #2*2
-            umlal       v14.4s, v12.4h, v0.h[2]
-            umlal2      v15.4s, v12.8h, v0.h[2]
-            umlal       v14.4s, v13.4h, v0.h[2]
-            umlal2      v15.4s, v13.8h, v0.h[2]
-    101:    ext         v12.16b, v7.16b, v8.16b, #7*2
-            ext         v13.16b, v8.16b, v9.16b, #1*2
-            umlal       v14.4s, v12.4h, v0.h[1]
-            umlal2      v15.4s, v12.8h, v0.h[1]
-            umlal       v14.4s, v13.4h, v0.h[1]
-            umlal2      v15.4s, v13.8h, v0.h[1]
-
-            uqrshrn     v14.4h, v14.4s, #16 // Saturating rounding narrow: both 32-bit sum vectors >> 16 are packed into v14.8h ...
-            uqrshrn2    v14.8h, v15.4s, #16
-            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS // ... then >> FRACTION_BITS (defined outside this macro) into 8 bytes of v15.
-
-            mov         v6.16b, v7.16b // Slide the data window down one register for the next invocation.
-            mov         v7.16b, v8.16b
-            mov         v8.16b, v9.16b
-            mov         v9.16b, v10.16b
-            mov         v10.16b, v11.16b // v11 is never written here; presumably the caller refills it — confirm.
-.endm/*}}}*/
-
-.macro hconv1_25/*{{{*/ // Same ladder scheme with up to 25 steps over the window v31, v4-v10 (v11 is read at step 25 and by the final shift); 8 result bytes end up in v15. Clobbers x12, x16, v12-v15.
-.rodata                 // The jump table is emitted into read-only data rather than the instruction stream.
-   200:     .hword -4   // Entry 0. NOTE(review): resolves to the instruction just before 100:, so x5 == 0 is presumably never used — confirm.
-            .hword 101f-100f // Entries 1..25: byte offset of label 1NN from the branch at 100:.
-            .hword 102f-100f
-            .hword 103f-100f
-            .hword 104f-100f
-            .hword 105f-100f
-            .hword 106f-100f
-            .hword 107f-100f
-            .hword 108f-100f
-            .hword 109f-100f
-            .hword 110f-100f
-            .hword 111f-100f
-            .hword 112f-100f
-            .hword 113f-100f
-            .hword 114f-100f
-            .hword 115f-100f
-            .hword 116f-100f
-            .hword 117f-100f
-            .hword 118f-100f
-            .hword 119f-100f
-            .hword 120f-100f
-            .hword 121f-100f
-            .hword 122f-100f
-            .hword 123f-100f
-            .hword 124f-100f
-            .hword 125f-100f
-            .align 4
-.text
-            ext         v12.16b, v6.16b, v7.16b, #7*2 // Centre vector = last halfword of v6 followed by the first seven of v7.
-            umull       v14.4s, v12.4h, v0.h[0] // v14 / v15 hold the 32-bit sums for lanes 0-3 / 4-7, seeded with centre * v0.h[0].
-            umull2      v15.4s, v12.8h, v0.h[0]
-
-            adrp        x16, 200b // x16 = address of table 200 (page, then low 12 bits).
-            add         x16, x16, :lo12:200b
-            ldrsh       x12, [x16, x5, LSL #1] // x12 = sign-extended 16-bit table entry number x5.
-            adr         x16, 100f // x16 = address of label 100, the base the table offsets are relative to.
-            add         x12, x12, x16
-    100:    br          x12 // Enter the ladder at label (100 + x5); execution falls through every lower label down to 101.
-    125:    ext         v12.16b, v31.16b, v4.16b, #6*2 // Step k: v12 / v13 = the vectors k halfwords before / after the centre, both scaled by coefficient lane k (v3.h[1] for k = 25).
-            ext         v13.16b, v10.16b, v11.16b, #0*2 // An ext with #0 simply copies its first source register.
-            umlal       v14.4s, v12.4h, v3.h[1]
-            umlal2      v15.4s, v12.8h, v3.h[1]
-            umlal       v14.4s, v13.4h, v3.h[1]
-            umlal2      v15.4s, v13.8h, v3.h[1]
-    124:    ext         v12.16b, v31.16b, v4.16b, #7*2
-            ext         v13.16b, v9.16b, v10.16b, #7*2
-            umlal       v14.4s, v12.4h, v3.h[0]
-            umlal2      v15.4s, v12.8h, v3.h[0]
-            umlal       v14.4s, v13.4h, v3.h[0]
-            umlal2      v15.4s, v13.8h, v3.h[0]
-    123:    ext         v12.16b, v4.16b, v5.16b, #0*2 // Steps 23..16 take their coefficients from v2.h[7]..v2.h[0].
-            ext         v13.16b, v9.16b, v10.16b, #6*2
-            umlal       v14.4s, v12.4h, v2.h[7]
-            umlal2      v15.4s, v12.8h, v2.h[7]
-            umlal       v14.4s, v13.4h, v2.h[7]
-            umlal2      v15.4s, v13.8h, v2.h[7]
-    122:    ext         v12.16b, v4.16b, v5.16b, #1*2
-            ext         v13.16b, v9.16b, v10.16b, #5*2
-            umlal       v14.4s, v12.4h, v2.h[6]
-            umlal2      v15.4s, v12.8h, v2.h[6]
-            umlal       v14.4s, v13.4h, v2.h[6]
-            umlal2      v15.4s, v13.8h, v2.h[6]
-    121:    ext         v12.16b, v4.16b, v5.16b, #2*2
-            ext         v13.16b, v9.16b, v10.16b, #4*2
-            umlal       v14.4s, v12.4h, v2.h[5]
-            umlal2      v15.4s, v12.8h, v2.h[5]
-            umlal       v14.4s, v13.4h, v2.h[5]
-            umlal2      v15.4s, v13.8h, v2.h[5]
-    120:    ext         v12.16b, v4.16b, v5.16b, #3*2
-            ext         v13.16b, v9.16b, v10.16b, #3*2
-            umlal       v14.4s, v12.4h, v2.h[4]
-            umlal2      v15.4s, v12.8h, v2.h[4]
-            umlal       v14.4s, v13.4h, v2.h[4]
-            umlal2      v15.4s, v13.8h, v2.h[4]
-    119:    ext         v12.16b, v4.16b, v5.16b, #4*2
-            ext         v13.16b, v9.16b, v10.16b, #2*2
-            umlal       v14.4s, v12.4h, v2.h[3]
-            umlal2      v15.4s, v12.8h, v2.h[3]
-            umlal       v14.4s, v13.4h, v2.h[3]
-            umlal2      v15.4s, v13.8h, v2.h[3]
-    118:    ext         v12.16b, v4.16b, v5.16b, #5*2
-            ext         v13.16b, v9.16b, v10.16b, #1*2
-            umlal       v14.4s, v12.4h, v2.h[2]
-            umlal2      v15.4s, v12.8h, v2.h[2]
-            umlal       v14.4s, v13.4h, v2.h[2]
-            umlal2      v15.4s, v13.8h, v2.h[2]
-    117:    ext         v12.16b, v4.16b, v5.16b, #6*2
-            ext         v13.16b, v9.16b, v10.16b, #0*2
-            umlal       v14.4s, v12.4h, v2.h[1]
-            umlal2      v15.4s, v12.8h, v2.h[1]
-            umlal       v14.4s, v13.4h, v2.h[1]
-            umlal2      v15.4s, v13.8h, v2.h[1]
-    116:    ext         v12.16b, v4.16b, v5.16b, #7*2
-            ext         v13.16b, v8.16b, v9.16b, #7*2
-            umlal       v14.4s, v12.4h, v2.h[0]
-            umlal2      v15.4s, v12.8h, v2.h[0]
-            umlal       v14.4s, v13.4h, v2.h[0]
-            umlal2      v15.4s, v13.8h, v2.h[0]
-    115:    ext         v12.16b, v5.16b, v6.16b, #0*2 // Steps 15..8 take their coefficients from v1.h[7]..v1.h[0].
-            ext         v13.16b, v8.16b, v9.16b, #6*2
-            umlal       v14.4s, v12.4h, v1.h[7]
-            umlal2      v15.4s, v12.8h, v1.h[7]
-            umlal       v14.4s, v13.4h, v1.h[7]
-            umlal2      v15.4s, v13.8h, v1.h[7]
-    114:    ext         v12.16b, v5.16b, v6.16b, #1*2
-            ext         v13.16b, v8.16b, v9.16b, #5*2
-            umlal       v14.4s, v12.4h, v1.h[6]
-            umlal2      v15.4s, v12.8h, v1.h[6]
-            umlal       v14.4s, v13.4h, v1.h[6]
-            umlal2      v15.4s, v13.8h, v1.h[6]
-    113:    ext         v12.16b, v5.16b, v6.16b, #2*2
-            ext         v13.16b, v8.16b, v9.16b, #4*2
-            umlal       v14.4s, v12.4h, v1.h[5]
-            umlal2      v15.4s, v12.8h, v1.h[5]
-            umlal       v14.4s, v13.4h, v1.h[5]
-            umlal2      v15.4s, v13.8h, v1.h[5]
-    112:    ext         v12.16b, v5.16b, v6.16b, #3*2
-            ext         v13.16b, v8.16b, v9.16b, #3*2
-            umlal       v14.4s, v12.4h, v1.h[4]
-            umlal2      v15.4s, v12.8h, v1.h[4]
-            umlal       v14.4s, v13.4h, v1.h[4]
-            umlal2      v15.4s, v13.8h, v1.h[4]
-    111:    ext         v12.16b, v5.16b, v6.16b, #4*2
-            ext         v13.16b, v8.16b, v9.16b, #2*2
-            umlal       v14.4s, v12.4h, v1.h[3]
-            umlal2      v15.4s, v12.8h, v1.h[3]
-            umlal       v14.4s, v13.4h, v1.h[3]
-            umlal2      v15.4s, v13.8h, v1.h[3]
-    110:    ext         v12.16b, v5.16b, v6.16b, #5*2
-            ext         v13.16b, v8.16b, v9.16b, #1*2
-            umlal       v14.4s, v12.4h, v1.h[2]
-            umlal2      v15.4s, v12.8h, v1.h[2]
-            umlal       v14.4s, v13.4h, v1.h[2]
-            umlal2      v15.4s, v13.8h, v1.h[2]
-    109:    ext         v12.16b, v5.16b, v6.16b, #6*2
-            ext         v13.16b, v8.16b, v9.16b, #0*2
-            umlal       v14.4s, v12.4h, v1.h[1]
-            umlal2      v15.4s, v12.8h, v1.h[1]
-            umlal       v14.4s, v13.4h, v1.h[1]
-            umlal2      v15.4s, v13.8h, v1.h[1]
-    108:    ext         v12.16b, v5.16b, v6.16b, #7*2
-            ext         v13.16b, v7.16b, v8.16b, #7*2
-            umlal       v14.4s, v12.4h, v1.h[0]
-            umlal2      v15.4s, v12.8h, v1.h[0]
-            umlal       v14.4s, v13.4h, v1.h[0]
-            umlal2      v15.4s, v13.8h, v1.h[0]
-    107:    ext         v12.16b, v6.16b, v7.16b, #0*2 // Steps 7..1 take their coefficients from v0.h[7]..v0.h[1].
-            ext         v13.16b, v7.16b, v8.16b, #6*2
-            umlal       v14.4s, v12.4h, v0.h[7]
-            umlal2      v15.4s, v12.8h, v0.h[7]
-            umlal       v14.4s, v13.4h, v0.h[7]
-            umlal2      v15.4s, v13.8h, v0.h[7]
-    106:    ext         v12.16b, v6.16b, v7.16b, #1*2
-            ext         v13.16b, v7.16b, v8.16b, #5*2
-            umlal       v14.4s, v12.4h, v0.h[6]
-            umlal2      v15.4s, v12.8h, v0.h[6]
-            umlal       v14.4s, v13.4h, v0.h[6]
-            umlal2      v15.4s, v13.8h, v0.h[6]
-    105:    ext         v12.16b, v6.16b, v7.16b, #2*2
-            ext         v13.16b, v7.16b, v8.16b, #4*2
-            umlal       v14.4s, v12.4h, v0.h[5]
-            umlal2      v15.4s, v12.8h, v0.h[5]
-            umlal       v14.4s, v13.4h, v0.h[5]
-            umlal2      v15.4s, v13.8h, v0.h[5]
-    104:    ext         v12.16b, v6.16b, v7.16b, #3*2
-            ext         v13.16b, v7.16b, v8.16b, #3*2
-            umlal       v14.4s, v12.4h, v0.h[4]
-            umlal2      v15.4s, v12.8h, v0.h[4]
-            umlal       v14.4s, v13.4h, v0.h[4]
-            umlal2      v15.4s, v13.8h, v0.h[4]
-    103:    ext         v12.16b, v6.16b, v7.16b, #4*2
-            ext         v13.16b, v7.16b, v8.16b, #2*2
-            umlal       v14.4s, v12.4h, v0.h[3]
-            umlal2      v15.4s, v12.8h, v0.h[3]
-            umlal       v14.4s, v13.4h, v0.h[3]
-            umlal2      v15.4s, v13.8h, v0.h[3]
-    102:    ext         v12.16b, v6.16b, v7.16b, #5*2
-            ext         v13.16b, v7.16b, v8.16b, #1*2
-            umlal       v14.4s, v12.4h, v0.h[2]
-            umlal2      v15.4s, v12.8h, v0.h[2]
-            umlal       v14.4s, v13.4h, v0.h[2]
-            umlal2      v15.4s, v13.8h, v0.h[2]
-    101:    ext         v12.16b, v6.16b, v7.16b, #6*2
-            ext         v13.16b, v7.16b, v8.16b, #0*2
-            umlal       v14.4s, v12.4h, v0.h[1]
-            umlal2      v15.4s, v12.8h, v0.h[1]
-            umlal       v14.4s, v13.4h, v0.h[1]
-            umlal2      v15.4s, v13.8h, v0.h[1]
-
-            uqrshrn     v14.4h, v14.4s, #16 // Saturating rounding narrow: both 32-bit sum vectors >> 16 are packed into v14.8h ...
-            uqrshrn2    v14.8h, v15.4s, #16
-            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS // ... then >> FRACTION_BITS (defined outside this macro) into 8 bytes of v15.
-
-            mov         v31.16b, v4.16b // Slide the data window down one register for the next invocation.
-            mov         v4.16b, v5.16b
-            mov         v5.16b, v6.16b
-            mov         v6.16b, v7.16b
-            mov         v7.16b, v8.16b
-            mov         v8.16b, v9.16b
-            mov         v9.16b, v10.16b
-            mov         v10.16b, v11.16b // v11 is never written here; presumably the caller refills it — confirm.
-.endm/*}}}*/
-
-#define TUNED_LIST4 6, 12, 20 // Matches the hconv4_6, hconv4_12 and hconv4_20 macros below; hconv4_25 is not listed. NOTE(review): presumably the sizes that get a dedicated code path — confirm at the use site.
-.macro hconv4_6/*{{{*/ // Ladder of up to 6 steps over the window v4-v10 where each step moves 4 halfwords (half a register) away from the centre v7; 8 result bytes end up in v15. Clobbers x12, x16, v14, v15.
-.rodata                // The jump table is emitted into read-only data rather than the instruction stream.
-   200:     .hword -4  // Entry 0. NOTE(review): resolves to the instruction just before 100:, so x5 == 0 is presumably never used — confirm.
-            .hword 101f-100f // Entries 1..6: byte offset of label 1NN from the branch at 100:.
-            .hword 102f-100f
-            .hword 103f-100f
-            .hword 104f-100f
-            .hword 105f-100f
-            .hword 106f-100f
-            .align      4
-.text
-            umull       v14.4s, v7.4h, v0.h[0] // v14 / v15 hold the 32-bit sums for lanes 0-3 / 4-7, seeded with centre v7 * v0.h[0].
-            umull2      v15.4s, v7.8h, v0.h[0]
-
-            adrp        x16, 200b // x16 = address of table 200 (page, then low 12 bits).
-            add         x16, x16, :lo12:200b
-            ldrsh       x12, [x16, x5, LSL #1] // x12 = sign-extended 16-bit table entry number x5.
-            adr         x16, 100f // x16 = address of label 100, the base the table offsets are relative to.
-            add         x12, x12, x16
-    100:    br          x12 // Enter the ladder at label (100 + x5); execution falls through every lower label down to 101.
-    106:    umlal       v14.4s, v4.4h,  v0.h[6] // Even steps land on whole registers: v4 and v10 are three registers either side of v7.
-            umlal2      v15.4s, v4.8h,  v0.h[6]
-            umlal       v14.4s, v10.4h, v0.h[6]
-            umlal2      v15.4s, v10.8h, v0.h[6]
-    105:    umlal2      v14.4s, v4.8h,  v0.h[5] // Odd steps straddle two registers: the high half of one feeds v14 and the low half of the next feeds v15.
-            umlal       v15.4s, v5.4h, v0.h[5]
-            umlal2      v14.4s, v9.8h, v0.h[5]
-            umlal       v15.4s, v10.4h, v0.h[5]
-    104:    umlal       v14.4s, v5.4h, v0.h[4]
-            umlal2      v15.4s, v5.8h, v0.h[4]
-            umlal       v14.4s, v9.4h, v0.h[4]
-            umlal2      v15.4s, v9.8h, v0.h[4]
-    103:    umlal2      v14.4s, v5.8h, v0.h[3]
-            umlal       v15.4s, v6.4h, v0.h[3]
-            umlal2      v14.4s, v8.8h, v0.h[3]
-            umlal       v15.4s, v9.4h, v0.h[3]
-    102:    umlal       v14.4s, v6.4h, v0.h[2]
-            umlal2      v15.4s, v6.8h, v0.h[2]
-            umlal       v14.4s, v8.4h, v0.h[2]
-            umlal2      v15.4s, v8.8h, v0.h[2]
-    101:    umlal2      v14.4s, v6.8h, v0.h[1]
-            umlal       v15.4s, v7.4h, v0.h[1]
-            umlal2      v14.4s, v7.8h, v0.h[1]
-            umlal       v15.4s, v8.4h, v0.h[1]
-
-            uqrshrn     v14.4h, v14.4s, #16 // Saturating rounding narrow: both 32-bit sum vectors >> 16 are packed into v14.8h ...
-            uqrshrn2    v14.8h, v15.4s, #16
-            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS // ... then >> FRACTION_BITS (defined outside this macro) into 8 bytes of v15.
-
-            mov         v4.16b, v5.16b // Slide the data window down one register for the next invocation.
-            mov         v5.16b, v6.16b
-            mov         v6.16b, v7.16b
-            mov         v7.16b, v8.16b
-            mov         v8.16b, v9.16b
-            mov         v9.16b, v10.16b
-            mov         v10.16b, v11.16b // v11 is never written here; presumably the caller refills it — confirm.
-.endm/*}}}*/
-
-.macro hconv4_12/*{{{*/ // Ladder of up to 12 half-register steps around centre v4 over the window v26-v31, v4-v10; 8 result bytes end up in v15. Clobbers x12, x16, v14, v15.
-.rodata                 // The jump table is emitted into read-only data rather than the instruction stream.
-   200:     .hword -4 //Might need to remove these...
-            .hword 101f-100f // Entries 1..12: byte offset of label 1NN from the branch at 100:.
-            .hword 102f-100f
-            .hword 103f-100f
-            .hword 104f-100f
-            .hword 105f-100f
-            .hword 106f-100f
-            .hword 107f-100f
-            .hword 108f-100f
-            .hword 109f-100f
-            .hword 110f-100f
-            .hword 111f-100f
-            .hword 112f-100f
-            .align 4
-.text
-            umull       v14.4s, v4.4h, v0.h[0] // v14 / v15 hold the 32-bit sums for lanes 0-3 / 4-7, seeded with centre v4 * v0.h[0].
-            umull2      v15.4s, v4.8h, v0.h[0]
-
-            adrp        x16, 200b // x16 = address of table 200 (page, then low 12 bits).
-            add         x16, x16, :lo12:200b
-            ldrsh       x12, [x16, x5, LSL #1] // x12 = sign-extended 16-bit table entry number x5.
-            adr         x16, 100f // x16 = address of label 100, the base the table offsets are relative to.
-            add         x12, x12, x16
-    100:    br          x12 // Enter the ladder at label (100 + x5); execution falls through every lower label down to 101.
-    112:    umlal       v14.4s, v26.4h, v1.h[4] // Even steps land on whole registers: v26 and v10 are six registers either side of v4 in window order.
-            umlal2      v15.4s, v26.8h, v1.h[4]
-            umlal       v14.4s, v10.4h, v1.h[4]
-            umlal2      v15.4s, v10.8h, v1.h[4]
-    111:    umlal2      v14.4s, v26.8h, v1.h[3] // Odd steps straddle two registers: the high half of one feeds v14 and the low half of the next feeds v15.
-            umlal       v15.4s, v27.4h, v1.h[3]
-            umlal2      v14.4s, v9.8h, v1.h[3]
-            umlal       v15.4s, v10.4h, v1.h[3]
-    110:    umlal       v14.4s, v27.4h, v1.h[2]
-            umlal2      v15.4s, v27.8h, v1.h[2]
-            umlal       v14.4s, v9.4h, v1.h[2]
-            umlal2      v15.4s, v9.8h, v1.h[2]
-    109:    umlal2      v14.4s, v27.8h, v1.h[1]
-            umlal       v15.4s, v28.4h, v1.h[1]
-            umlal2      v14.4s, v8.8h, v1.h[1]
-            umlal       v15.4s, v9.4h, v1.h[1]
-    108:    umlal       v14.4s, v28.4h, v1.h[0]
-            umlal2      v15.4s, v28.8h, v1.h[0]
-            umlal       v14.4s, v8.4h, v1.h[0]
-            umlal2      v15.4s, v8.8h, v1.h[0]
-    107:    umlal2      v14.4s, v28.8h, v0.h[7] // Steps 7..1 take their coefficients from v0.h[7]..v0.h[1].
-            umlal       v15.4s, v29.4h, v0.h[7]
-            umlal2      v14.4s, v7.8h, v0.h[7]
-            umlal       v15.4s, v8.4h, v0.h[7]
-    106:    umlal       v14.4s, v29.4h, v0.h[6]
-            umlal2      v15.4s, v29.8h, v0.h[6]
-            umlal       v14.4s, v7.4h, v0.h[6]
-            umlal2      v15.4s, v7.8h, v0.h[6]
-    105:    umlal2      v14.4s, v29.8h, v0.h[5]
-            umlal       v15.4s, v30.4h, v0.h[5]
-            umlal2      v14.4s, v6.8h, v0.h[5]
-            umlal       v15.4s, v7.4h, v0.h[5]
-    104:    umlal       v14.4s, v30.4h, v0.h[4]
-            umlal2      v15.4s, v30.8h, v0.h[4]
-            umlal       v14.4s, v6.4h, v0.h[4]
-            umlal2      v15.4s, v6.8h, v0.h[4]
-    103:    umlal2      v14.4s, v30.8h, v0.h[3]
-            umlal       v15.4s, v31.4h, v0.h[3]
-            umlal2      v14.4s, v5.8h, v0.h[3]
-            umlal       v15.4s, v6.4h, v0.h[3]
-    102:    umlal       v14.4s, v31.4h, v0.h[2]
-            umlal2      v15.4s, v31.8h, v0.h[2]
-            umlal       v14.4s, v5.4h, v0.h[2]
-            umlal2      v15.4s, v5.8h, v0.h[2]
-    101:    umlal2      v14.4s, v31.8h, v0.h[1]
-            umlal       v15.4s, v4.4h,  v0.h[1]
-            umlal2      v14.4s, v4.8h,  v0.h[1]
-            umlal       v15.4s, v5.4h, v0.h[1]
-
-            uqrshrn     v14.4h, v14.4s, #16 // Saturating rounding narrow: both 32-bit sum vectors >> 16 are packed into v14.8h ...
-            uqrshrn2    v14.8h, v15.4s, #16
-            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS // ... then >> FRACTION_BITS (defined outside this macro) into 8 bytes of v15.
-
-            mov         v26.16b, v27.16b // Slide the data window down one register for the next invocation.
-            mov         v27.16b, v28.16b
-            mov         v28.16b, v29.16b
-            mov         v29.16b, v30.16b
-            mov         v30.16b, v31.16b
-            mov         v31.16b, v4.16b // The window order wraps from v31 to v4 here.
-            mov         v4.16b, v5.16b
-            mov         v5.16b, v6.16b
-            mov         v6.16b, v7.16b
-            mov         v7.16b, v8.16b
-            mov         v8.16b, v9.16b
-            mov         v9.16b, v10.16b
-            mov         v10.16b, v11.16b // v11 is never written here; presumably the caller refills it — confirm.
-.endm/*}}}*/
-
-.macro hconv4_20/*{{{*/ // Ladder of up to 20 half-register steps around centre v28 over the window v18-v31, v4-v10; 8 result bytes end up in v15. Clobbers x12, x16, v14, v15.
-.rodata                 // The jump table is emitted into read-only data rather than the instruction stream.
-   200:     .hword -4   // Entry 0. NOTE(review): resolves to the instruction just before 100:, so x5 == 0 is presumably never used — confirm.
-            .hword 101f-100f // Entries 1..20: byte offset of label 1NN from the branch at 100:.
-            .hword 102f-100f
-            .hword 103f-100f
-            .hword 104f-100f
-            .hword 105f-100f
-            .hword 106f-100f
-            .hword 107f-100f
-            .hword 108f-100f
-            .hword 109f-100f
-            .hword 110f-100f
-            .hword 111f-100f
-            .hword 112f-100f
-            .hword 113f-100f
-            .hword 114f-100f
-            .hword 115f-100f
-            .hword 116f-100f
-            .hword 117f-100f
-            .hword 118f-100f
-            .hword 119f-100f
-            .hword 120f-100f
-            .align 4
-.text
-            umull       v14.4s, v28.4h, v0.h[0] // v14 / v15 hold the 32-bit sums for lanes 0-3 / 4-7, seeded with centre v28 * v0.h[0].
-            umull2      v15.4s, v28.8h, v0.h[0]
-
-            adrp        x16, 200b // x16 = address of table 200 (page, then low 12 bits).
-            add         x16, x16, :lo12:200b
-            ldrsh       x12, [x16, x5, LSL #1] // x12 = sign-extended 16-bit table entry number x5.
-            adr         x16, 100f // x16 = address of label 100, the base the table offsets are relative to.
-            add         x12, x12, x16
-    100:    br          x12 // Enter the ladder at label (100 + x5); execution falls through every lower label down to 101.
-    120:    umlal       v14.4s, v18.4h, v2.h[4] // Even steps land on whole registers: v18 and v10 are ten registers either side of v28 in window order.
-            umlal2      v15.4s, v18.8h, v2.h[4]
-            umlal       v14.4s, v10.4h, v2.h[4]
-            umlal2      v15.4s, v10.8h, v2.h[4]
-    119:    umlal2      v14.4s, v18.8h, v2.h[3] // Odd steps straddle two registers: the high half of one feeds v14 and the low half of the next feeds v15.
-            umlal       v15.4s, v19.4h, v2.h[3]
-            umlal2      v14.4s, v9.8h,  v2.h[3]
-            umlal       v15.4s, v10.4h, v2.h[3]
-    118:    umlal       v14.4s, v19.4h, v2.h[2]
-            umlal2      v15.4s, v19.8h, v2.h[2]
-            umlal       v14.4s, v9.4h,  v2.h[2]
-            umlal2      v15.4s, v9.8h,  v2.h[2]
-    117:    umlal2      v14.4s, v19.8h, v2.h[1]
-            umlal       v15.4s, v20.4h, v2.h[1]
-            umlal2      v14.4s, v8.8h,  v2.h[1]
-            umlal       v15.4s, v9.4h,  v2.h[1]
-    116:    umlal       v14.4s, v20.4h, v2.h[0]
-            umlal2      v15.4s, v20.8h, v2.h[0]
-            umlal       v14.4s, v8.4h,  v2.h[0]
-            umlal2      v15.4s, v8.8h,  v2.h[0]
-    115:    umlal2      v14.4s, v20.8h, v1.h[7] // Steps 15..8 take their coefficients from v1.h[7]..v1.h[0].
-            umlal       v15.4s, v21.4h, v1.h[7]
-            umlal2      v14.4s, v7.8h,  v1.h[7]
-            umlal       v15.4s, v8.4h,  v1.h[7]
-    114:    umlal       v14.4s, v21.4h, v1.h[6]
-            umlal2      v15.4s, v21.8h, v1.h[6]
-            umlal       v14.4s, v7.4h,  v1.h[6]
-            umlal2      v15.4s, v7.8h,  v1.h[6]
-    113:    umlal2      v14.4s, v21.8h, v1.h[5]
-            umlal       v15.4s, v22.4h, v1.h[5]
-            umlal2      v14.4s, v6.8h,  v1.h[5]
-            umlal       v15.4s, v7.4h,  v1.h[5]
-    112:    umlal       v14.4s, v22.4h, v1.h[4]
-            umlal2      v15.4s, v22.8h, v1.h[4]
-            umlal       v14.4s, v6.4h,  v1.h[4]
-            umlal2      v15.4s, v6.8h,  v1.h[4]
-    111:    umlal2      v14.4s, v22.8h, v1.h[3]
-            umlal       v15.4s, v23.4h, v1.h[3]
-            umlal2      v14.4s, v5.8h,  v1.h[3]
-            umlal       v15.4s, v6.4h,  v1.h[3]
-    110:    umlal       v14.4s, v23.4h, v1.h[2]
-            umlal2      v15.4s, v23.8h, v1.h[2]
-            umlal       v14.4s, v5.4h,  v1.h[2]
-            umlal2      v15.4s, v5.8h,  v1.h[2]
-    109:    umlal2      v14.4s, v23.8h, v1.h[1]
-            umlal       v15.4s, v24.4h, v1.h[1]
-            umlal2      v14.4s, v4.8h,  v1.h[1]
-            umlal       v15.4s, v5.4h,  v1.h[1]
-    108:    umlal       v14.4s, v24.4h, v1.h[0]
-            umlal2      v15.4s, v24.8h, v1.h[0]
-            umlal       v14.4s, v4.4h,  v1.h[0]
-            umlal2      v15.4s, v4.8h,  v1.h[0]
-    107:    umlal2      v14.4s, v24.8h, v0.h[7] // Steps 7..1 take their coefficients from v0.h[7]..v0.h[1].
-            umlal       v15.4s, v25.4h, v0.h[7]
-            umlal2      v14.4s, v31.8h, v0.h[7]
-            umlal       v15.4s, v4.4h,  v0.h[7]
-    106:    umlal       v14.4s, v25.4h, v0.h[6]
-            umlal2      v15.4s, v25.8h, v0.h[6]
-            umlal       v14.4s, v31.4h, v0.h[6]
-            umlal2      v15.4s, v31.8h, v0.h[6]
-    105:    umlal2      v14.4s, v25.8h, v0.h[5]
-            umlal       v15.4s, v26.4h, v0.h[5]
-            umlal2      v14.4s, v30.8h, v0.h[5]
-            umlal       v15.4s, v31.4h, v0.h[5]
-    104:    umlal       v14.4s, v26.4h, v0.h[4]
-            umlal2      v15.4s, v26.8h, v0.h[4]
-            umlal       v14.4s, v30.4h, v0.h[4]
-            umlal2      v15.4s, v30.8h, v0.h[4]
-    103:    umlal2      v14.4s, v26.8h, v0.h[3]
-            umlal       v15.4s, v27.4h, v0.h[3]
-            umlal2      v14.4s, v29.8h, v0.h[3]
-            umlal       v15.4s, v30.4h, v0.h[3]
-    102:    umlal       v14.4s, v27.4h, v0.h[2]
-            umlal2      v15.4s, v27.8h, v0.h[2]
-            umlal       v14.4s, v29.4h, v0.h[2]
-            umlal2      v15.4s, v29.8h, v0.h[2]
-    101:    umlal2      v14.4s, v27.8h, v0.h[1]
-            umlal       v15.4s, v28.4h, v0.h[1]
-            umlal2      v14.4s, v28.8h, v0.h[1]
-            umlal       v15.4s, v29.4h, v0.h[1]
-
-            uqrshrn     v14.4h, v14.4s, #16 // Saturating rounding narrow: both 32-bit sum vectors >> 16 are packed into v14.8h ...
-            uqrshrn2    v14.8h, v15.4s, #16
-            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS // ... then >> FRACTION_BITS (defined outside this macro) into 8 bytes of v15.
-
-            mov         v18.16b, v19.16b // Slide the data window down one register for the next invocation.
-            mov         v19.16b, v20.16b
-            mov         v20.16b, v21.16b
-            mov         v21.16b, v22.16b
-            mov         v22.16b, v23.16b
-            mov         v23.16b, v24.16b
-            mov         v24.16b, v25.16b
-            mov         v25.16b, v26.16b
-            mov         v26.16b, v27.16b
-            mov         v27.16b, v28.16b
-            mov         v28.16b, v29.16b
-            mov         v29.16b, v30.16b
-            mov         v30.16b, v31.16b
-            mov         v31.16b, v4.16b // The window order wraps from v31 to v4 here.
-            mov         v4.16b, v5.16b
-            mov         v5.16b, v6.16b
-            mov         v6.16b, v7.16b
-            mov         v7.16b, v8.16b
-            mov         v8.16b, v9.16b
-            mov         v9.16b, v10.16b
-            mov         v10.16b, v11.16b // v11 is never written here; presumably the caller refills it — confirm.
-.endm/*}}}*/
-
-.macro hconv4_25/*{{{*/
-.rodata
-   200:     .hword -4
-            .hword 101f-100f
-            .hword 102f-100f
-            .hword 103f-100f
-            .hword 104f-100f
-            .hword 105f-100f
-            .hword 106f-100f
-            .hword 107f-100f
-            .hword 108f-100f
-            .hword 109f-100f
-            .hword 110f-100f
-            .hword 111f-100f
-            .hword 112f-100f
-            .hword 113f-100f
-            .hword 114f-100f
-            .hword 115f-100f
-            .hword 116f-100f
-            .hword 117f-100f
-            .hword 118f-100f
-            .hword 119f-100f
-            .hword 120f-100f
-            .hword 121f-100f
-            .hword 122f-100f
-            .hword 123f-100f
-            .hword 124f-100f
-            .hword 125f-100f
-            .align 4
-.text
-            umull2      v14.4s, v25.8h, v0.h[0]
-            umull       v15.4s, v26.4h, v0.h[0]
-
-            adrp        x16, 200b
-            add         x16, x16, :lo12:200b
-            ldrsh       x12, [x16, x5, LSL #1]
-            adr         x16, 100f
-            add         x12, x12, x16
-    100:    br          x12
-    125:    ld1         {v12.8h}, [x9]
-            umlal       v14.4s, v12.4h, v3.h[1]
-            umlal2      v15.4s, v12.8h, v3.h[1]
-            umlal       v14.4s, v10.4h, v3.h[1]
-            umlal2      v15.4s, v10.8h, v3.h[1]
-    124:    add         x12, x9, #0x08
-            bic         x12, x12, #0x40
-            ld1         {v12.4h}, [x12], #8
-            bic         x12, x12, #0x40
-            ld1         {v13.4h}, [x12]
-            umlal       v14.4s, v12.4h, v3.h[0]
-            umlal       v15.4s, v13.4h, v3.h[0]
-            umlal2      v14.4s, v9.8h,  v3.h[0]
-            umlal       v15.4s, v10.4h, v3.h[0]
-    123:    add         x12, x9, #0x10
-            bic         x12, x12, #0x40
-            ld1         {v12.8h}, [x12]
-            umlal       v14.4s, v12.4h, v2.h[7]
-            umlal2      v15.4s, v12.8h, v2.h[7]
-            umlal       v14.4s, v9.4h,  v2.h[7]
-            umlal2      v15.4s, v9.8h,  v2.h[7]
-    122:    add         x12, x9, #0x18
-            bic         x12, x12, #0x40
-            ld1         {v12.4h}, [x12], #8
-            bic         x12, x12, #0x40
-            ld1         {v13.4h}, [x12]
-            umlal       v14.4s, v12.4h, v2.h[6]
-            umlal       v15.4s, v13.4h, v2.h[6]
-            umlal2      v14.4s, v8.8h,  v2.h[6]
-            umlal       v15.4s, v9.4h,  v2.h[6]
-    121:    add         x12, x9, #0x20
-            bic         x12, x12, #0x40
-            ld1         {v12.8h}, [x12]
-            umlal       v14.4s, v12.4h, v2.h[5]
-            umlal2      v15.4s, v12.8h, v2.h[5]
-            umlal       v14.4s, v8.4h,  v2.h[5]
-            umlal2      v15.4s, v8.8h,  v2.h[5]
-    120:    add         x12, x9, #0x28
-            bic         x12, x12, #0x40
-            ld1         {v12.4h}, [x12], #8
-            bic         x12, x12, #0x40
-            ld1         {v13.4h}, [x12]
-            umlal       v14.4s, v12.4h, v2.h[4]
-            umlal       v15.4s, v13.4h, v2.h[4]
-            umlal2      v14.4s, v7.8h,  v2.h[4]
-            umlal       v15.4s, v8.4h,  v2.h[4]
-    119:    add         x12, x9, #0x30
-            bic         x12, x12, #0x40
-            ld1         {v12.8h}, [x12]
-            umlal       v14.4s, v12.4h, v2.h[3]
-            umlal2      v15.4s, v12.8h, v2.h[3]
-            umlal       v14.4s, v7.4h,  v2.h[3]
-            umlal2      v15.4s, v7.8h,  v2.h[3]
-    118:    add         x12, x9, #0x38
-            bic         x12, x12, #0x40
-            ld1         {v12.4h}, [x12]
-            umlal       v14.4s, v12.4h, v2.h[2]
-            umlal       v15.4s, v17.4h, v2.h[2]
-            umlal2      v14.4s, v6.8h,  v2.h[2]
-            umlal       v15.4s, v7.4h,  v2.h[2]
-    117:    umlal       v14.4s, v17.4h, v2.h[1]
-            umlal2      v15.4s, v17.8h, v2.h[1]
-            umlal       v14.4s, v6.4h,  v2.h[1]
-            umlal2      v15.4s, v6.8h,  v2.h[1]
-    116:    umlal2      v14.4s, v17.8h, v2.h[0]
-            umlal       v15.4s, v18.4h, v2.h[0]
-            umlal2      v14.4s, v5.8h,  v2.h[0]
-            umlal       v15.4s, v6.4h,  v2.h[0]
-    115:    umlal       v14.4s, v18.4h, v1.h[7]
-            umlal2      v15.4s, v18.8h, v1.h[7]
-            umlal       v14.4s, v5.4h,  v1.h[7]
-            umlal2      v15.4s, v5.8h,  v1.h[7]
-    114:    umlal2      v14.4s, v18.8h, v1.h[6]
-            umlal       v15.4s, v19.4h, v1.h[6]
-            umlal2      v14.4s, v4.8h,  v1.h[6]
-            umlal       v15.4s, v5.4h,  v1.h[6]
-    113:    umlal       v14.4s, v19.4h, v1.h[5]
-            umlal2      v15.4s, v19.8h, v1.h[5]
-            umlal       v14.4s, v4.4h,  v1.h[5]
-            umlal2      v15.4s, v4.8h,  v1.h[5]
-    112:    umlal2      v14.4s, v19.8h, v1.h[4]
-            umlal       v15.4s, v20.4h, v1.h[4]
-            umlal2      v14.4s, v31.8h, v1.h[4]
-            umlal       v15.4s, v4.4h,  v1.h[4]
-    111:    umlal       v14.4s, v20.4h, v1.h[3]
-            umlal2      v15.4s, v20.8h, v1.h[3]
-            umlal       v14.4s, v31.4h, v1.h[3]
-            umlal2      v15.4s, v31.8h, v1.h[3]
-    110:    umlal2      v14.4s, v20.8h, v1.h[2]
-            umlal       v15.4s, v21.4h, v1.h[2]
-            umlal2      v14.4s, v30.8h, v1.h[2]
-            umlal       v15.4s, v31.4h, v1.h[2]
-    109:    umlal       v14.4s, v21.4h, v1.h[1]
-            umlal2      v15.4s, v21.8h, v1.h[1]
-            umlal       v14.4s, v30.4h, v1.h[1]
-            umlal2      v15.4s, v30.8h, v1.h[1]
-    108:    umlal2      v14.4s, v21.8h, v1.h[0]
-            umlal       v15.4s, v22.4h, v1.h[0]
-            umlal2      v14.4s, v29.8h, v1.h[0]
-            umlal       v15.4s, v30.4h, v1.h[0]
-    107:    umlal       v14.4s, v22.4h, v0.h[7]
-            umlal2      v15.4s, v22.8h, v0.h[7]
-            umlal       v14.4s, v29.4h, v0.h[7]
-            umlal2      v15.4s, v29.8h, v0.h[7]
-    106:    umlal2      v14.4s, v22.8h, v0.h[6]
-            umlal       v15.4s, v23.4h, v0.h[6]
-            umlal2      v14.4s, v28.8h, v0.h[6]
-            umlal       v15.4s, v29.4h, v0.h[6]
-    105:    umlal       v14.4s, v23.4h, v0.h[5]
-            umlal2      v15.4s, v23.8h, v0.h[5]
-            umlal       v14.4s, v28.4h, v0.h[5]
-            umlal2      v15.4s, v28.8h, v0.h[5]
-    104:    umlal2      v14.4s, v23.8h, v0.h[4]
-            umlal       v15.4s, v24.4h, v0.h[4]
-            umlal2      v14.4s, v27.8h, v0.h[4]
-            umlal       v15.4s, v28.4h, v0.h[4]
-    103:    umlal       v14.4s, v24.4h, v0.h[3]
-            umlal2      v15.4s, v24.8h, v0.h[3]
-            umlal       v14.4s, v27.4h, v0.h[3]
-            umlal2      v15.4s, v27.8h, v0.h[3]
-    102:    umlal2      v14.4s, v24.8h, v0.h[2]
-            umlal       v15.4s, v25.4h, v0.h[2]
-            umlal2      v14.4s, v26.8h, v0.h[2]
-            umlal       v15.4s, v27.4h, v0.h[2]
-    101:    umlal       v14.4s, v25.4h, v0.h[1]
-            umlal2      v15.4s, v25.8h, v0.h[1]
-            umlal       v14.4s, v26.4h, v0.h[1]
-            umlal2      v15.4s, v26.8h, v0.h[1]
-
-            uqrshrn     v14.4h, v14.4s, #16
-            uqrshrn2    v14.8h, v15.4s, #16
-            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
-
-            st1         {v17.16b}, [x9], #16
-            bic         x9, x9, #0x40
-            mov         v17.16b, v18.16b
-            mov         v18.16b, v19.16b
-            mov         v19.16b, v20.16b
-            mov         v20.16b, v21.16b
-            mov         v21.16b, v22.16b
-            mov         v22.16b, v23.16b
-            mov         v23.16b, v24.16b
-            mov         v24.16b, v25.16b
-            mov         v25.16b, v26.16b
-            mov         v26.16b, v27.16b
-            mov         v27.16b, v28.16b
-            mov         v28.16b, v29.16b
-            mov         v29.16b, v30.16b
-            mov         v30.16b, v31.16b
-            mov         v31.16b, v4.16b
-            mov         v4.16b, v5.16b
-            mov         v5.16b, v6.16b
-            mov         v6.16b, v7.16b
-            mov         v7.16b, v8.16b
-            mov         v8.16b, v9.16b
-            mov         v9.16b, v10.16b
-            mov         v10.16b, v11.16b
-.endm/*}}}*/
-
-/* Dedicated function wrapper for the fetch macro, for the cases where
- * performance isn't that important, to keep code size down.
- *
- * x10 and x11 are pushed before the fetch macro is expanded and popped
- * afterwards, so callers see both registers unchanged.
- */
-PRIVATE(fetch_generic_asm)
-            stp         x10, x11, [sp, #-16]!   /* push x10/x11 */
-            fetch
-            ldp         x10, x11, [sp], #16     /* pop x10/x11 */
-            ret
-END(fetch_generic_asm)
-
-
-/* Fetch the next (16 - (x10 & 15)) columns of data, avoiding reading memory
- * beyond that limit, and filling the rest of the vector with the first legal
- * pixel (lane 0 of the fetched chunk).
- * Result is in v10 and v11.  v8 and v9 are filled with the first legal pixel.
- * When (x10 & 15) is non-zero, x1, x15, x19 and x10 are each reduced by that
- * amount, which leaves x10 a multiple of 16.
- * Note: This function can read beyond the right edge of input if the image is
- * narrower than 16 bytes.
- */
-PRIVATE(fetch_clampleft1)
-            stp         x29, x30, [sp, #-16]!   /* bl below overwrites x30 */
-            bl          fetch_generic_asm
-            dup         v8.8h, v10.h[0]         /* v8 = v9 = halfword lane 0 of v10 */
-            dup         v9.8h, v10.h[0]
-            ands        x12, x10, #15           /* x12 = x10 & 15 */
-            beq         1f                      /* zero: no padding inside this chunk */
-            sub         x1, x1, x12             /* step the three row pointers back by x12 */
-            sub         x15, x15, x12
-            sub         x19, x19, x12
-            sub         x10, x10, x12
-            sub         x12, sp, x12, LSL #1    /* x12 = sp - 2 * (x10 & 15) */
-            sub         sp, sp, #64             /* 64-byte scratch area at sp */
-            sub         x12, x12, #32           /* x12 = scratch + 32 - 2 * (x10 & 15) */
-            st1         {v8.8h, v9.8h, v10.8h,v11.8h}, [sp]  /* scratch = pad, pad, data, data */
-            ld1         {v10.8h,v11.8h}, [x12]  /* reload: padding now occupies the low lanes */
-            add         sp, sp, #64
-1:          ldp         x29, x30, [sp], #16
-            ret
-END(fetch_clampleft1)
-
-/* Variant of fetch_clampleft1 whose padding value is the low 64 bits of v10
- * (dup of v10.d[0]) instead of a single halfword lane.  The prefill macro
- * selects it with step=4.  The shifting logic through the 64-byte stack
- * scratch area is otherwise instruction-for-instruction the same.
- * Result is in v10 and v11; v8 and v9 hold the padding value.
- */
-PRIVATE(fetch_clampleft4)
-            stp         x29, x30, [sp, #-16]!   /* bl below overwrites x30 */
-            bl          fetch_generic_asm
-            dup         v8.2d, v10.d[0]         /* v8 = v9 = low 64 bits of v10 */
-            dup         v9.2d, v10.d[0]
-            ands        x12, x10, #15           /* x12 = x10 & 15 */
-            beq         1f                      /* zero: no padding inside this chunk */
-            sub         x1, x1, x12             /* step the three row pointers back by x12 */
-            sub         x15, x15, x12
-            sub         x19, x19, x12
-            sub         x10, x10, x12
-            sub         x12, sp, x12, LSL #1    /* x12 = sp - 2 * (x10 & 15) */
-            sub         sp, sp, #64             /* 64-byte scratch area at sp */
-            sub         x12, x12, #32           /* x12 = scratch + 32 - 2 * (x10 & 15) */
-            st1         {v8.8h, v9.8h, v10.8h,v11.8h}, [sp]  /* scratch = pad, pad, data, data */
-            ld1         {v10.8h,v11.8h}, [x12]  /* reload: padding now occupies the low lanes */
-            add         sp, sp, #64
-1:          ldp         x29, x30, [sp], #16
-            ret
-END(fetch_clampleft4)
-
-/* Fetch only the next (x11 & 15) (where 0 means 16) columns of data, avoiding
- * reading memory beyond that limit, and filling the rest of the vector with
- * the last legal pixel.
- * Result is in v10 and v11.  v12 and v13 are filled with the last legal pixel.
- * In the partial case x1, x15 and x19 are stepped back by ((-x11) & 15) before
- * the fetch, and the fetched data is then shifted down by the same number of
- * lanes through a 64-byte stack scratch area.
- * Note: This function can read beyond the left edge of input if the image is
- * narrower than 16 bytes.
- */
-PRIVATE(fetch_clampright1)
-            stp         x29, x30, [sp, #-16]!   /* bl below overwrites x30 */
-            sub         x12, xzr, x11
-            ands        x12, x12, #15           /* x12 = (-x11) & 15 */
-            beq         1f                      /* zero: a whole chunk is available */
-            sub         x1, x1, x12             /* step the three row pointers back by x12 */
-            sub         x15, x15, x12
-            sub         x19, x19, x12
-            bl          fetch_generic_asm
-            dup         v12.8h, v11.h[7]        /* v12 = v13 = last halfword lane of v11 */
-            dup         v13.8h, v11.h[7]
-            sub         x12, xzr, x11           /* x12 is recomputed after the call */
-            and         x12, x12, #15
-            sub         sp, sp, #64             /* 64-byte scratch area at sp */
-            add         x12, sp, x12, LSL #1    /* x12 = scratch + 2 * ((-x11) & 15) */
-            st1         {v10.8h,v11.8h,v12.8h,v13.8h}, [sp]  /* scratch = data, data, pad, pad */
-            ld1         {v10.8h,v11.8h}, [x12]  /* reload: padding now occupies the high lanes */
-            add         sp, sp, #64
-            ldp         x29, x30, [sp], #16
-            ret
-1:          bl          fetch_generic_asm       /* full chunk: only the padding value is needed */
-            dup         v12.8h, v11.h[7]
-            dup         v13.8h, v11.h[7]
-            ldp         x29, x30, [sp], #16
-            ret
-END(fetch_clampright1)
-
-/* Variant of fetch_clampright1 whose padding value is the high 64 bits of v11
- * (dup of v11.d[1]) instead of a single halfword lane.  The prefill and
- * conv_body macros select it with step=4.
- * Result is in v10 and v11; v12 and v13 hold the padding value.
- */
-PRIVATE(fetch_clampright4)
-            stp         x29, x30, [sp, #-16]!   /* bl below overwrites x30 */
-            sub         x12, xzr, x11
-            ands        x12, x12, #15           /* x12 = (-x11) & 15 */
-            beq         1f                      /* zero: a whole chunk is available */
-            sub         x1, x1, x12             /* step the three row pointers back by x12 */
-            sub         x15, x15, x12
-            sub         x19, x19, x12
-            bl          fetch_generic_asm
-            dup         v12.2d, v11.d[1]        /* v12 = v13 = high 64 bits of v11 */
-            dup         v13.2d, v11.d[1]
-            sub         x12, xzr, x11           /* x12 is recomputed after the call */
-            and         x12, x12, #15
-            sub         sp, sp, #64             /* 64-byte scratch area at sp */
-            add         x12, sp, x12, LSL #1    /* x12 = scratch + 2 * ((-x11) & 15) */
-            st1         {v10.8h,v11.8h,v12.8h,v13.8h}, [sp]  /* scratch = data, data, pad, pad */
-            ld1         {v10.8h,v11.8h}, [x12]  /* reload: padding now occupies the high lanes */
-            add         sp, sp, #64
-            ldp         x29, x30, [sp], #16
-            ret
-1:          bl          fetch_generic_asm       /* full chunk: only the padding value is needed */
-            dup         v12.2d, v11.d[1]
-            dup         v13.2d, v11.d[1]
-            ldp         x29, x30, [sp], #16
-            ret
-END(fetch_clampright4)
-
-/* Given values in v10 and v11, and an index in x11, sweep the (x11 & 15)th
- * value across to fill the rest of the register pair.  Used for filling the
- * right hand edge of the window when reading too close to the right hand edge
- * of the image.
- * Also returns a dup-ed copy of the last element in v12 and v13 for the
- * tail-fill case (this happens incidentally in common path, but must be done
- * deliberately in the fast-out path).
- * Clobbers x12.  Uses a 64-byte scratch area below sp for the duration of the
- * call.
- */
-PRIVATE(prefill_sweepright1)
-            ands        x12, x11, #15
-            beq         1f                      /* (x11 & 15) == 0: nothing to sweep */
-            sub         x12, x12, #1            /* index of the element to replicate */
-            sub         sp, sp, #64
-            st1         {v10.8h,v11.8h}, [sp]   /* spill the pair so it can be indexed */
-            add         x12, sp, x12, LSL #1    /* address of that 16-bit element */
-            ld1r        {v12.8h}, [x12]         /* replicate it into every lane of v12, v13 */
-            ld1r        {v13.8h}, [x12]
-            st1         {v12.8h,v13.8h}, [x12]  /* overwrite from that element onwards */
-            ld1         {v10.8h,v11.8h}, [sp]   /* reload the swept pair */
-            add         sp, sp, #64
-            ret
-1:          dup         v12.8h, v11.h[7]        /* fast-out: padding is the last lane of v11 */
-            dup         v13.8h, v11.h[7]
-            ret
-END(prefill_sweepright1)
-
-/* Four-channel counterpart of prefill_sweepright1.  Given values in v10 and
- * v11 and an index in x11, replicate the last legal 64-bit group (four 16-bit
- * values, located 4 elements before index (x11 & 15)) across the rest of the
- * register pair, and return that group dup-ed into both v12 and v13 for the
- * tail-fill case.
- * Clobbers x12.  Uses a 64-byte scratch area below sp for the duration of the
- * call.
- *
- * Both padding registers are loaded before the 32-byte store, matching
- * prefill_sweepright1.  Storing v13 without loading it would write stale
- * register contents into the window and leave v13 undefined for the
- * right-fill stage; storing only 16 bytes would leave part of v11 unswept.
- */
-PRIVATE(prefill_sweepright4)
-            ands        x12, x11, #15
-            beq         1f                      /* (x11 & 15) == 0: nothing to sweep */
-            sub         x12, x12, #4            /* index of the first element of the last group */
-            sub         sp, sp, #64
-            st1         {v10.8h,v11.8h}, [sp]   /* spill the pair so it can be indexed */
-            add         x12, sp, x12, LSL #1    /* address of that 64-bit group */
-            ld1r        {v12.2d}, [x12]         /* replicate it into both halves of v12, v13 */
-            ld1r        {v13.2d}, [x12]
-            st1         {v12.8h,v13.8h}, [x12]  /* overwrite from that group onwards */
-            ld1         {v10.8h,v11.8h}, [sp]   /* reload the swept pair */
-            add         sp, sp, #64
-            ret
-1:          dup         v12.2d, v11.d[1]        /* fast-out: padding is the high half of v11 */
-            dup         v13.2d, v11.d[1]
-            ret
-END(prefill_sweepright4)
-
-/* The main loop keeps a sliding window of data that has already been convolved
- * in the vertical axis for the current line.  This usually stays in the
- * register file, but spills to memory for large windows.  The first thing that
- * needs to be done at start-up is to fill this window with image data, taking
- * into account the padding needed if the left or right edges of the image fall
- * within this window.
- */
-
-/* Because the window is in the register file writes to it cannot be indexed
- * by another register.  Consequently the fill loops are unrolled to address
- * the registers directly.  This macro distinguishes between writes to the
- * register file and writes to the spill buffer (indicated by a destination
- * register named xx).
- *
- *   ra, rb   -- destination registers for this window slot, or xx for spill
- *   sra, srb -- source registers holding the 2 x 16 bytes to place
- */
-.macro prefill_out ra, rb, sra, srb
-  .ifc \ra,xx
-    .ifc \rb,xx
-            /* Both halves go to the spill buffer at x9 (post-incremented). */
-            st1         {\sra,\srb}, [x9], #32
-    .else
-            /* First half to the spill buffer, second half to a register.
-             * Bit 6 of x9 is cleared before the store.
-             * NOTE(review): presumably this wraps the pointer back into the
-             * 0x40-byte buffer set up by the caller -- confirm. */
-            bic         x9, x9, #0x40
-            st1         {\sra}, [x9], #16
-            mov         \rb, \srb
-    .endif
-  .else
-    /* Register destinations: emit a move only when source and destination
-     * differ. */
-    .ifnc \ra,\sra
-            mov         \ra, \sra
-    .endif
-    .ifnc \rb,\srb
-            mov         \rb, \srb
-    .endif
-  .endif
-.endm
-
-/* This macro provides the list of registers representing the window, and the
- * cases where the register file is too small and a spill buffer is used
- * instead.
- * Since several specialisations of each function are generated, this also
- * culls superfluous iterations, and sets the variable `i` for subsequent
- * macros indicating the current index into the window.
- *
- *   macro     -- stage to expand; prefill_<macro> is invoked once per slot
- *   nextmacro -- stage whose labels are passed as the hand-over targets
- *   max_r     -- accepted but not referenced inside this macro
- *   step      -- forwarded unchanged to prefill_<macro>
- *   label     -- prefix used to build every label emitted here
- *
- * For slot number `line` (13 down to 1) the emitted label is
- * <label><macro><line>.  The stage receives <label><nextmacro><line> as its
- * `next` target and <label><nextmacro><line - 1> as its `after` target.
- * After the last slot, <label><macro>0 is emitted followed by a branch to
- * <label>_end.
- */
-.macro prefill_list, macro, nextmacro, max_r, step, label
-  /* Helper local to this expansion; purged again at the end. */
-  .macro ifneeded macro, nextmacro, line, nextline, ra, rb, step, label
-    /* Slots that lie beyond the window size are not emitted. */
-    .if windowsize >= (\line * 16)
-      .set i, windowsize - (\line * 16)
-\label\macro\line:
-            prefill_\macro \label\nextmacro\line, \label\nextmacro\nextline, \ra, \rb, \step
-    .endif
-  .endm
-            ifneeded \macro \nextmacro, 13, 12, xx,      xx,      \step, \label
-            ifneeded \macro \nextmacro, 12, 11, xx,      xx,      \step, \label
-            ifneeded \macro \nextmacro, 11, 10, xx,      v17.16b, \step, \label
-            ifneeded \macro \nextmacro, 10,  9, v18.16b, v19.16b, \step, \label
-            ifneeded \macro \nextmacro,  9,  8, v20.16b, v21.16b, \step, \label
-            ifneeded \macro \nextmacro,  8,  7, v22.16b, v23.16b, \step, \label
-            ifneeded \macro \nextmacro,  7,  6, v24.16b, v25.16b, \step, \label
-            ifneeded \macro \nextmacro,  6,  5, v26.16b, v27.16b, \step, \label
-            ifneeded \macro \nextmacro,  5,  4, v28.16b, v29.16b, \step, \label
-            ifneeded \macro \nextmacro,  4,  3, v30.16b, v31.16b, \step, \label
-            ifneeded \macro \nextmacro,  3,  2, v4.16b,  v5.16b,  \step, \label
-            ifneeded \macro \nextmacro,  2,  1, v6.16b,  v7.16b,  \step, \label
-            ifneeded \macro \nextmacro,  1,  0, v8.16b,  v9.16b,  \step, \label
-\label\macro\()0:
-            b           \label\()_end
-  .purgem ifneeded
-.endm
-
-/* These macros represent the possible stages of filling the window.
- * Each macro is unrolled enough times that it can fill the entire window
- * itself, but normally it will have to hand control to subsequent macros
- * part-way through and this is done using labels named \next and \after, where
- * \next is the next macro starting at the same window position and \after is
- * the next macro starting after the current window position.
- */
-
-/* leftfill: v8 and v9 contain the left padding value.  While the window
- * extends outside of the image on the left-hand side, and at least 16 more
- * padding values are needed in the window, store v8 and v9 into the window.
- * Otherwise skip forward to storing image data.
- *
- * `i` is the window index of this slot, set by prefill_list.  The macro
- * branches to \next when x10 < i + 16 (unsigned); \after and \step are unused.
- */
-.macro prefill_leftfill, next, after, ra, rb, step
-            cmp         x10, #i+16
-            blo         \next
-            prefill_out \ra, \rb, v8.16b, v9.16b
-.endm
-
-/* leftedge: The very first non-fill or partial-fill chunk from the image is
- * already loaded (as it was used to calculate the left padding value), so
- * store it here, and then drop into the regular load/store cycle in the next
- * macro.
- *
- * Stores v10/v11 into this slot and branches unconditionally to \after;
- * \next and \step are unused.
- */
-.macro prefill_leftedge, next, after, ra, rb, step
-1:          prefill_out \ra, \rb, v10.16b, v11.16b
-            b           \after
-.endm
-
-/* dofetch: Copy chunks of the image into the window without any complications
- * from edge conditions.
- *
- * Branches to \next when x11 <= i + 16 (unsigned), i.e. when this slot is not
- * wholly before the fill stop index.  Otherwise fetches a chunk into v10/v11
- * and stores it to this slot, then falls through to the following slot.
- * \after and \step are unused.
- */
-.macro prefill_dofetch, next, after, ra, rb, step
-            cmp         x11, #i+16
-            bls         \next
-            bl          fetch_generic_asm
-            prefill_out \ra, \rb, v10.16b, v11.16b
-.endm
-
-/* rightedge: The last fetch (currently in v10 and v11) may have gone beyond
- * the right-hand edge of the image.  In that case sweep the last valid pixel
- * across the rest of the chunk, and in either case prepare padding data in v12
- * and v13 for the next macro.  This is done in fetch_clampright.
- * This only happens once before going on to the next macro.
- * Sometimes leftedge also covers the rightedge case, in which case this has
- * to be skipped altogether.
- *
- * Branches to \next when x11 <= i (unsigned).  Otherwise calls
- * fetch_clampright<step>, stores v10/v11 to this slot and branches to \after.
- */
-.macro prefill_rightedge, next, after, ra, rb, step
-            cmp         x11, #i
-            bls         \next
-            bl          fetch_clampright\step
-            prefill_out \ra, \rb, v10.16b, v11.16b
-            b           \after
-.endm
-
-/* rightfill: The rest of the window is simply filled with right padding from
- * v12 and v13.
- *
- * There is no test or branch here: each expansion stores the padding and
- * falls through to the next slot.  \next, \after and \step are unused.
- */
-.macro prefill_rightfill, next, after, ra, rb, step
-            prefill_out \ra, \rb, v12.16b, v13.16b
-.endm
-
-/* Here all of the macros above are unrolled and laid out in the proper order.
- *
- * Each prefill_list line expands one stage for every window slot and names the
- * stage that its \next / \after labels point into.  The final stage names
- * `oops` as its successor; prefill_rightfill never references \next or
- * \after, so no label containing `oops` is ever used.
- * Every stage ends with a branch to <label>_end, which is defined here.
- */
-.macro prefill_body, max_r, step, label
-            prefill_list leftfill,  leftedge,   \max_r, \step, \label
-            prefill_list leftedge,  dofetch,    \max_r, \step, \label
-            prefill_list dofetch,   rightedge,  \max_r, \step, \label
-            prefill_list rightedge, rightfill,  \max_r, \step, \label
-            prefill_list rightfill, oops,       \max_r, \step, \label
-\label\()_end:
-.endm
-
-
-/* Fill the convolution window with context data.  The aim here is to load
- * exactly 2*r columns, and in the main loop to read as many columns as will be
- * written.  This is complicated by the window being divided into chunks at
- * register boundaries, and the need to handle cases when the input starts very
- * close to the left or right (or both) edges of the image and the need to fill
- * the spaces that leaves with left and right edge padding values.
- *
- * Macro arguments:
- *      step  -- selects the fetch_clampleft/prefill_sweepright variant (1 or 4)
- *      max_r -- largest radius handled by this specialisation
- *      label -- prefix for the labels generated by prefill_body
- * Assembly-time symbols set here:
- *      windowsize = 2 * max_r * step rounded up to a multiple of 16
- *      centertap  = windowsize - max_r * step
- *
- * Input:
- *      x1 -- src
- *      x2 -- pitch
- *      x3 -- count
- *      x4 -- available image data right of src pointer
- *      x5 -- r
- *      x6 -- rup
- *      x7 -- rdn
- *      x8 -- available image data left of src pointer
- *      x9 -- buffer (if needed)
- *      x13 = -pitch
- *      x15 = top-row in
- *      x19 = bottom-row in
- * Output:
- *      x4 -= min(inlen, count + windowsize - centertap)
- *      x1 += min(inlen, count + windowsize - centertap)
- *      x15 += min(inlen, count + windowsize - centertap)
- *      x19 += min(inlen, count + windowsize - centertap)
- * Modifies:
- *      x10 -- fill start index in the window
- *      x11 -- fill stop index in the window
- *      x12 -- scratch
- */
-.macro prefill step=1, max_r=25, label=xx
-.set windowsize, (((\max_r + \max_r) * \step + 15) & ~15)
-.set centertap, (windowsize - \max_r * \step)
-            /* x10 = centertap - x8, clamped to zero if x8 > centertap. */
-            mov         x10, #centertap
-            subs        x10, x10, x8
-            csel        x10, xzr, x10, lo
-
-            /* x11 = windowsize + min(0, x4 - (windowsize - centertap)). */
-            subs        x11, x4, #windowsize - centertap
-            csel        x11, xzr, x11, hs
-            add         x11, x11, #windowsize
-
-            /* x10 indicates where in the window legal image data begins.
-             * x11 indicates where in the window legal image data ends.
-             * When starting near the centre of a large image these would be
-             * zero and windowsize respectively, but when starting near the
-             * edges this can change.
-             * When starting on the leftmost pixel, x10 will be centertap.
-             * When starting on the rightmost pixel, x11 will be centertap+1.
-             */
-
-            /* x4 indicates how much data there is between the current pointers
-             * and the right edge of the image.  The pointers currently point
-             * to the data needed at centertap.  The subsequent code will
-             * consume (windowsize - x10) data, but only the data from
-             * centertap to windowsize comes out of x4's budget.
-             */
-1:          subs        x4, x4, #windowsize - centertap
-            csel        x4, xzr, x4, lo
-
-            /* And the pointers need to rewind to the start of the window.
-             */
-            sub         x1, x1, #centertap
-            sub         x15, x15, #centertap
-            sub         x19, x19, #centertap
-
-            /* Unless x8 indicated that there wasn't that much data available.
-             */
-            add         x1, x1, x10
-            add         x15, x15, x10
-            add         x19, x19, x10
-
-            /* Get the first chunk, and add padding to align it to the window
-             * if necessary.
-             */
-            bl          fetch_clampleft\step
-
-            /* Sometimes the start and the end of the window are in the same
-             * chunk.  In that case both ends need filler at the outset.
-             * (x10 ^ (x11 - 1)) < 16 exactly when x10 and x11 - 1 agree in
-             * every bit above the low four, i.e. lie in the same 16-entry
-             * chunk.
-             */
-            sub         x12, x11, #1
-            eor         x12,  x10, x12
-            cmp         x12, #16
-            bhs         1f
-            bl          prefill_sweepright\step
-
-            /* Iterate through all the points in the window and fill them in
-             * with padding or image data as needed.
-             */
-1:          prefill_body \max_r, \step, \label
-.endm
-
-/* The main body of the convolve functions.  Having already pre-filled the
- * convolution window with 2*r input values, the logic settles into a regular
- * pattern of reading and writing at a 1:1 rate until either input or output
- * expires.  The input leads the output by r values, so when processing all the
- * way to the right-hand edge, or within r pixels of that edge, the input will
- * run out first.  In the case of very narrow images, or sub-windows starting
- * near the right edge, the input may already have run out while the
- * convolution window was being filled and this loop will start with a
- * zero-length input.
- *
- * Once the input runs out, the rest of the output must be processed by padding
- * the remainder of the window with pad value from the last valid pixel from
- * the source.
- *
- * Macro arguments:
- *      core    -- name of the horizontal-convolution macro to expand; each
- *                 expansion is followed by an 8-byte store of v15
- *      step    -- selects the fetch_clampright variant and pad width (1 or 4)
- *      max_r   -- forwarded to the fetch macro
- *      labelc  -- label inside the fetch code used when r != rup or r != rdn
- *      labelnc -- label inside the fetch code used when r == rup == rdn
- *
- * Input:
- *      x0 = dst
- *      x1 = src
- *      x2 = pitch
- *      x3 = count
- *      x4 = inlen
- *      x5 = r
- *      x6 = rup
- *      x7 = rdn
- *      x9 = buffer
- *      x13 = -pitch
- *      x15 = top-row in
- *      x19 = bottom-row in
- * Modifies
- *      x8 = fetch code pointer
- * Output:
- *      x0 = 0 on exit
- */
-.macro conv_body core, step=1, max_r=25, labelc="", labelnc=""
-
-            /* If x4 >= x3 then there's no need for clipping.  The main loop
-             * needs to exit when either x3 or x4 runs out, so clamp x4 to be
-             * no greater than x3 and use x4 for the loop.
-             * However, if x4 comes out of the loop with less than 16 bytes
-             * left, a partial read would be necessary to avoid reading beyond
-             * the end of the image.  To avoid this, clamp x4 to the next
-             * multiple of 16, which is still sufficient to force it out of the
-             * loop but doesn't imply a rewind.
-             */
-            add         x12, x3, #15
-            bic         x12, x12, #15
-            cmp         x4, x12
-            csel        x4, x12, x4, hi
-
-            /* First calculate the entry-point into the internal fetch logic.
-             * This is done so the same function can service several kernel
-             * sizes.
-             * Short-cut entry: x8 = labelnc - r * 40  (r << 5 plus r << 3).
-             */
-            adrp        x8, \labelnc
-            add         x8, x8, #:lo12:\labelnc
-            sub         x8, x8, x5, LSL #5
-            sub         x8, x8, x5, LSL #3
-            cmp         x5, x6
-            ccmp        x5, x7, #0, eq
-            beq         5f
-
-            /* if (r != rup || r != rdn) then the address-clamping table should
-             * be used rather than the short-cut version.
-             * Clamping entry: x8 = labelc - r * 56  (r << 6 minus r << 3).
-             */
-            adrp        x8, \labelc
-            add         x8, x8, #:lo12:\labelc
-            sub         x8, x8, x5, LSL #6
-            add         x8, x8, x5, LSL #3
-            b           5f
-
-            /* Main loop: ... */
-            .align  4
-3:          /* first perform a vertical convolution from memory to get the next
-             * 16 taps of the horizontal window into the register file...
-             */
-            fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8
-
-            /* ...then perform a horizontal convolution on that window to
-             * produce eight output bytes, and slide the window along.
-             * This has to be done twice to match the 16-way vertical pass.
-             * It would be preferable to have twice the work done in \core, but
-             * that would demand yet another variant on those macros and would
-             * perturb the register allocation severely.
-             */
-            \core
-            st1         {v15.8b}, [x0], #8
-            \core
-            st1         {v15.8b}, [x0], #8
-
-            sub         x3, x3, #16
-            /* Loop test; this is also where both set-up paths above enter. */
-5:          subs        x4, x4, #16
-            bhi         3b
-            /* Here there's 16 or fewer bytes available before the edge of the
-             * source image.  x4 holds that count minus 16 (because it was
-             * decremented before the first iteration ran).  The last read may
-             * not be a whole chunk, and beyond that a fill value must be used.
-             *
-             * Of course, none of that matters if there's no more output to
-             * produce...
-             */
-            cbz         x3, 5f
-
-            /* Oh well. */
-            adds        x4, x4, #16
-            bne         1f
-            /* No input left at all: build the pad from the top of v9. */
-  .if \step==1
-            dup         v10.8h, v9.h[7]
-            dup         v11.8h, v9.h[7]
-  .else
-            dup         v10.2d, v9.d[1]
-            dup         v11.2d, v9.d[1]
-  .endif
-            b           3f
-
-            /* To avoid reading past end of input, rewind pointers by (16-x4)
-             * to ensure that they're exactly 16 bytes from the edge.
-             */
-1:          mov         x11, x4
-            bl          fetch_clampright\step
-            /* Now to put this padding to use, perform any remaining
-             * iterations.  This is done at half the rate of the main loop,
-             * because there's no longer pressure from a 16-lane window filler.
-             */
-3:          \core
-  .if \step==1
-            dup         v11.8h, v11.h[7]
-  .else
-            dup         v11.2d, v11.d[1]
-  .endif
-            subs        x3, x3, #8
-            blo         4f
-            st1         {v15.8b}, [x0], #8
-            /* st1 leaves the flags alone, so this still tests the subs. */
-            bne         3b
-            b           5f
-
-            /* If the final iteration contained 0 < l < 8 values, then perform
-             * a piecewise store of the final vector.
-             * x3 is l - 8 here, so its low three bits equal l; each tbz tests
-             * one of those bits and ext rotates the stored bytes out of v15.
-             */
-4:          tbz         x3, #2, 1f
-            st1         {v15.s}[0], [x0], #4
-            ext         v15.8b, v15.8b, v15.8b, #4
-1:          tbz         x3, #1, 1f
-            st1         {v15.h}[0], [x0], #2
-            ext         v15.8b, v15.8b, v15.8b, #2
-1:          tbz         x3, #0, 5f
-            st1         {v15.b}[0], [x0], #1
-            ext         v15.8b, v15.8b, v15.8b, #1
-5:          mov         x0, #0
-.endm
-
-
-/* Emit one private function convolve1_<r> for every radius in TUNED_LIST1
- * (defined outside this excerpt) plus the catch-all radius 25.  Each function
- * saves x29/x30, pre-fills the window, runs the main convolution body with
- * step=1, then restores x29/x30 and returns.  Register arguments are the ones
- * documented on the prefill and conv_body macros.
- */
-.irp r, TUNED_LIST1, 25
-PRIVATE(convolve1_\r)
-            stp         x29,x30, [sp, #-16]!
-
-            prefill     step=1, max_r=\r, label=.Lcnv1_\r
-
-            conv_body   core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r
-
-            ldp         x29,x30, [sp], #16
-            ret
-END(convolve1_\r)
-.endr
-
-/* Emit one private function convolve4_<r> for every radius in TUNED_LIST4
- * (defined outside this excerpt) plus the catch-all radius 25.  These are the
- * step=4 counterparts of convolve1_<r>.  They also reserve stack space for
- * the window spill buffer and pass its address in x9.
- */
-.irp r, TUNED_LIST4, 25
-PRIVATE(convolve4_\r)
-            sub         x9, sp, #0x40           /* candidate buffer address below entry sp */
-            /* Push x29/x30 and reserve 16 + 0x40 + 0x80 bytes in one step. */
-            stp         x29,x30, [sp, #-(16 + 0x40 + 0x80)]!
-            bic         x9, x9, #0x7f           /* round down to a 0x80-byte boundary */
-
-            /* x9 now points to a 0x40 byte buffer on the stack whose address
-             * has the low 7 bits clear.  This allows easy address calculation
-             * in the wrap-around cases.
-             */
-
-            prefill     step=4, max_r=\r, label=.Lcnv4_\r
-
-            conv_body   core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r
-
-            ldp         x29,x30, [sp], #(16 + 0x40 + 0x80)
-            ret
-END(convolve4_\r)
-.endr
-
-/* void rsdIntrinsicBlurU1_K(
- *                  void *out,      // x0
- *                  void *in,       // x1
- *                  size_t w,       // x2
- *                  size_t h,       // x3
- *                  size_t p,       // x4
- *                  size_t x,       // x5
- *                  size_t y,       // x6
- *                  size_t count,   // x7
- *                  size_t r,       // [sp]
- *                  uint16_t *tab); // [sp,#8]
- *
- * Entry point for the single-channel blur.  Saves x19/x30 and the low 64 bits
- * of v8-v15, rearranges the C arguments into the register layout documented
- * on the prefill and conv_body macros, loads 64 bytes of coefficients from
- * tab into v0-v3, and then jumps to the convolve1_<r> specialisation for the
- * smallest tuned radius that is >= r (convolve1_25 if none matches).
- * x0 is zero on return (set at the end of conv_body).
- */
-ENTRY(rsdIntrinsicBlurU1_K)
-            stp         x19,x30, [sp, #-16]!
-            sub         x8, sp, #32             /* x8 = upper half of the 64-byte save area */
-            sub         sp, sp, #64
-            st1         {v8.1d - v11.1d}, [sp]  /* save low halves of v8-v11 */
-            st1         {v12.1d - v15.1d}, [x8] /* save low halves of v12-v15 */
-            mov         x8, x5          // x
-            ldr         w5, [sp,#80]    // r  (80 = 64 + 16 bytes pushed above; low 32 bits only)
-            sub         x9, x2, x8      // w - x
-            sub         x10, x3, x6     // h - y
-            mov         x2, x4          // pitch
-            mov         x3, x7          // count
-            sub         x7, x10, #1     // h - y - 1
-            mov         x4, x9          // inlen = (w - x)
-
-            ldr         x12, [sp, #88] // tab
-
-            add         x1, x1, x8      // src += x
-
-            cmp         x6, x5
-            csel        x6, x5, x6, hs  // rup = min(r, y)
-            cmp         x7, x5
-            csel        x7, x5, x7, hs  // rdn = min(r, h - y - 1)
-
-            sub         x13, xzr, x2    // -pitch
-            msub        x15, x2, x6, x1 // top-row in    = src - pitch * rup
-            madd        x19, x2, x7, x1 // bottom-row in = src + pitch * rdn
-
-            ld1         {v0.8h,v1.8h}, [x12], #32   /* coefficient table -> v0-v3 */
-            ld1         {v2.8h,v3.8h}, [x12], #32
-
-            /* The specialisations are entered with plain branches, so set the
-             * link register by hand to make their ret land on 1: below.
-             */
-            adr         x30, 1f
-  .irp r, TUNED_LIST1
-            cmp         x5, #\r
-            bls         convolve1_\r
-  .endr
-            b           convolve1_25
-
-1:          ld1         {v8.1d - v11.1d}, [sp], #32     /* restore v8-v15 low halves */
-            ld1         {v12.1d - v15.1d}, [sp], #32
-            ldp         x19,x30, [sp], #16
-            ret
-END(rsdIntrinsicBlurU1_K)
-
-/* void rsdIntrinsicBlurU4_K(
- *                  void *out,      // x0
- *                  void *in,       // x1
- *                  size_t w,       // x2
- *                  size_t h,       // x3
- *                  size_t p,       // x4
- *                  size_t x,       // x5
- *                  size_t y,       // x6
- *                  size_t count,   // x7
- *                  size_t r,       // [sp]
- *                  uint16_t *tab); // [sp,#8]
- *
- * Entry point for the four-channel blur.  Identical in structure to
- * rsdIntrinsicBlurU1_K, except that x, w and count are multiplied by four
- * (shifted left by 2) before use and control is passed to the
- * convolve4_<r> specialisations.
- * x0 is zero on return (set at the end of conv_body).
- */
-ENTRY(rsdIntrinsicBlurU4_K)
-            stp         x19,x30, [sp, #-16]!
-            sub         x8, sp, #32             /* x8 = upper half of the 64-byte save area */
-            sub         sp, sp, #64
-            st1         {v8.1d - v11.1d}, [sp]  /* save low halves of v8-v11 */
-            st1         {v12.1d - v15.1d}, [x8] /* save low halves of v12-v15 */
-            lsl         x8, x5, #2      // x * 4
-            lsl         x2, x2, #2      // w * 4
-            ldr         w5, [sp,#80]    // r  (80 = 64 + 16 bytes pushed above; low 32 bits only)
-            sub         x9, x2, x8      // (w - x) * 4
-            sub         x10, x3, x6     // h - y
-            mov         x2, x4          // pitch
-            lsl         x3, x7, #2      // count * 4
-            sub         x7, x10, #1     // h - y - 1
-            mov         x4, x9          // inlen = (w - x) * 4
-
-            ldr         x12, [sp, #88]  // tab
-
-            add         x1, x1, x8      // in += x * 4
-
-            cmp         x6, x5
-            csel        x6, x5, x6, hs  // rup = min(r, y)
-            cmp         x7, x5
-            csel        x7, x5, x7, hs  // rdn = min(r, h - y - 1)
-
-
-            sub         x13, xzr, x2    // -pitch
-            msub        x15, x2, x6, x1 // top-row in    = in - pitch * rup
-            madd        x19, x2, x7, x1 // bottom-row in = in + pitch * rdn
-
-            ld1         {v0.8h,v1.8h}, [x12], #32   /* coefficient table -> v0-v3 */
-            ld1         {v2.8h,v3.8h}, [x12], #32
-
-            /* The specialisations are entered with plain branches, so set the
-             * link register by hand to make their ret land on 1: below.
-             */
-            adr         x30, 1f
-  .irp r, TUNED_LIST4
-            cmp         x5, #\r
-            bls         convolve4_\r
-  .endr
-            b           convolve4_25
-
-1:          ld1         {v8.1d - v11.1d}, [sp], #32     /* restore v8-v15 low halves */
-            ld1         {v12.1d - v15.1d}, [sp], #32
-            ldp         x19,x30, [sp], #16
-            ret
-END(rsdIntrinsicBlurU4_K)

diff --git a/toolkit/Blur_neon.S b/toolkit/Blur_neon.S
deleted file mode 100644
index 241af5f..0000000
--- a/toolkit/Blur_neon.S
+++ /dev/null

@@ -1,1824 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* ENTRY(f): start function f in .text with `.align 4`, export it with .globl,
- * mark the symbol as a function, define its label and open an unwind region
- * with .fnstart.
- */
-#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
-/* PRIVATE(f): same as ENTRY(f) but without .globl, so f is not exported. */
-#define PRIVATE(f) .text; .align 4; .type f,#function; f: .fnstart
-/* END(f): close the unwind region opened by .fnstart and record the size of f
- * as the distance from its label to this point.
- */
-#define END(f) .fnend; .size f, .-f;
-
-#define ARCH_ARM_USE_BLUR_PRELOAD
-
-.eabi_attribute 25,1 @Tag_ABI_align8_preserved
-.arm
-
-/* Number of fractional bits to preserve in intermediate results.  The
- * intermediate storage is 16-bit, and we started with 8 bit data (the integer
- * part), so this should be between 0 and 8.
- */
-.set FRACTION_BITS, 7
-
-.set MAX_R, 25
-
-
-/* A quick way of making a line of code conditional on some other condition.
- * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
- * `ifcc`:
- */
-.macro ifcc zzz:vararg
-/* zzz:vararg captures the remainder of the line; it is emitted unchanged only
- * when the assembler symbol cc is non-zero at the point of expansion.
- */
-.if cc
-            \zzz
-.endif
-.endm
-
-/* It's not always clear that prefetching is beneficial and this needs further
- * testing on different cores, so it's made switchable here.
- */
-#if defined(ARCH_ARM_USE_BLUR_PRELOAD)
-#define VERTPLD(...) pld [__VA_ARGS__]
-#else
-#define VERTPLD(...) nop
-#endif
-
-/* Fetch 16 columns of bytes (regardless of image format), convolve these
- * vertically, and leave them in the register file.  If working near the top or
- * bottom of an image then clamp the addressing while loading the data in.
- *
- * The convolution is fully unrolled for windows up to max_r, with the
- * outermost edges calculated first.  This way it's possible to branch directly
- * into the relevant part of the code for an arbitrary convolution radius.  Two
- * variants of the loop are produced; one eliminates the clamping code for a
- * slight speed advantage.
- *
- * Where the macro is called with reg=x, the specified register is taken to
- * contain a pre-calculated pointer into one of the two loops.
- *
- * Input:
- *      r1 -- src
- *      r2 -- pitch
- *      r5 -- r
- *      r6 -- rup (r, unless clipped to top of source image)
- *      r7 -- rdn (r, unless clipped to bottom of source image)
- *      r12 -- switch index
- *      q0-q3 -- coefficient table
- * Output:
- *      r1 += 16
- *      q10,q11 -- 16 convolved columns
- * Modifies:
- *      r10 = upper row pointer
- *      r11 = lower row pointer
- *      q12-q15 = temporary sums
- */
-.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=r12 /*{{{*/
-  /* cc = 1 when the default register (r12) is used: the ifcc-prefixed lines
-   * below then compute the entry address into the clamped loop.  With any
-   * other \reg those lines are dropped and \reg is used as supplied.
-   */
-  .ifc \reg,r12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif
-
-            /* Load the 16 bytes of the centre row and start on the upper row
-             * pointer: r10 = r1 - r2 * r6 (src - pitch * rup).
-             */
-            vld1.8      {d30,d31}, [r1]
-            mls         r10, r2, r6, r1
-
-            /* Widen the centre row to 16 bits per column in q14,q15. */
-            vmovl.u8    q14, d30
-            VERTPLD(r1, #32)
-            vmovl.u8    q15, d31
-  /* Get the address of the end of the clamped loop (the `1:` label after the
-   * vertfetch_clamped expansions) into \reg.  For small max_r a plain adr is
-   * used; otherwise the distance is read from the literal at `2:` and added to
-   * pc (presumably because the label is out of adr's reach for large max_r).
-   */
-  .if \max_r < 16 // approximate
-    ifcc    adr         \reg, 1f
-  .else
-    ifcc    ldr         \reg, 2f
-1:  ifcc    add         \reg, \reg, pc
-  .endif
-
-            /* Centre tap: q12-q15 = widened centre row * d0[0], four columns
-             * per accumulator.  Interleaved with it:
-             *   \reg -= r5 << 6   step back r iterations of 64 bytes each
-             *                     (one vertfetch_clamped body is 16 ARM
-             *                     instructions)
-             *   r11 = r1 + r2 * r7   lower row pointer (src + pitch * rdn)
-             *   r1 += 16             advance src to the next 16 columns
-             */
-            vmull.u16   q12, d28, d0[0]
-    ifcc    sub         \reg, r5, LSL #6
-            vmull.u16   q13, d29, d0[0]
-            mla         r11, r2, r7, r1
-            vmull.u16   q14, d30, d0[0]
-            add         r1, r1, #16
-            vmull.u16   q15, d31, d0[0]
-            bx          \reg
-
-  /* Literal for the long-range case above: distance from the `add ..., pc`
-   * instruction to the end of the clamped loop.  The extra -8 accounts for pc
-   * reading as the instruction's address plus 8 in ARM state.
-   */
-     ifcc   .align 2
-  2: ifcc   .word       1f-1b-8
-
-  /* This version of the vertical fetch loop body is used away from the edges
-   * of the source image.  The pointers start at the top and bottom source rows
-   * and work their way towards the centre on each iteration.  This way the
-   * number of taps used can be controlled by jumping directly into the middle
-   * of the loop and running to completion.
-   * If the loop body changes size then the code which calculates the address of
-   * the initial iteration must be updated accordingly.
-   */
-  .macro vertfetch_noclamp i, dreg
-    /* Emits nothing for i == 0 or i > max_r. */
-    .if 0 < \i && \i <= \max_r
-            /* Load the upper row (post-incrementing r10 by the pitch) and the
-             * lower row (r11 is stepped back by the pitch separately).
-             */
-            vld1.8      {d20,d21}, [r10], r2
-            vld1.8      {d22,d23}, [r11]
-            sub         r11, r11, r2
-            /* After the swap q10 pairs upper/lower columns 0-7 and q11 pairs
-             * upper/lower columns 8-15, so each vaddl sums the two rows that
-             * share this tap's coefficient.
-             */
-            vswp        d21, d22
-            VERTPLD(r10, #32)
-            vaddl.u8    q10, d20, d21
-            vaddl.u8    q11, d22, d23
-            /* Accumulate (upper + lower) * coefficient into the four sums. */
-            vmlal.u16   q12, d20, \dreg
-            VERTPLD(r11, #32)
-            vmlal.u16   q13, d21, \dreg
-            vmlal.u16   q14, d22, \dreg
-            vmlal.u16   q15, d23, \dreg
-    .endif
-  .endm
-
-  /* This version of the vertical fetch loop body is used near the edges of the
-   * source image, where one or both of the accesses may start with a clamped
-   * value, and the row addresses only begin to change after some number of
-   * iterations before the end.
-   * If the loop body changes size then the code which calculates the address of
-   * the initial iteration must be updated accordingly.
-   */
-  .macro vertfetch_clamped i, dreg
-    /* Emits nothing for i == 0 or i > max_r. */
-    .if 0 < \i && \i <= \max_r
-            vld1.8      {d20,d21}, [r10]
-            vld1.8      {d22,d23}, [r11]
-            /* The upper pointer only advances once rup (r6) >= i; until then
-             * the same row is re-read.
-             */
-            cmp         r6, #\i
-            vswp        d21, d22
-            VERTPLD(r10, #32)
-            vaddl.u8    q10, d20, d21
-            addhs       r10, r10, r2
-            vaddl.u8    q11, d22, d23
-            /* Likewise the lower pointer only retreats once rdn (r7) >= i. */
-            cmp         r7, #\i
-            vmlal.u16   q12, d20, \dreg
-            VERTPLD(r11, #32)
-            vmlal.u16   q13, d21, \dreg
-            subhs       r11, r11, r2
-            vmlal.u16   q14, d22, \dreg
-            /* The nop brings this body to 16 instructions, the 64-byte stride
-             * assumed by the `LSL #6` entry-address computation above.
-             */
-            nop
-            vmlal.u16   q15, d23, \dreg
-    .endif
-  .endm
-
-  /* Entry into this unrolled loop is computed as a negative index from
-   * \labelc at the end of the block.
-   */
-  .align 4
-  vertfetch_clamped 27, d6[3]
-  vertfetch_clamped 26, d6[2]
-  vertfetch_clamped 25, d6[1]
-  vertfetch_clamped 24, d6[0]
-  vertfetch_clamped 23, d5[3]
-  vertfetch_clamped 22, d5[2]
-  vertfetch_clamped 21, d5[1]
-  vertfetch_clamped 20, d5[0]
-  vertfetch_clamped 19, d4[3]
-  vertfetch_clamped 18, d4[2]
-  vertfetch_clamped 17, d4[1]
-  vertfetch_clamped 16, d4[0]
-  vertfetch_clamped 15, d3[3]
-  vertfetch_clamped 14, d3[2]
-  vertfetch_clamped 13, d3[1]
-  vertfetch_clamped 12, d3[0]
-  vertfetch_clamped 11, d2[3]
-  vertfetch_clamped 10, d2[2]
-  vertfetch_clamped  9, d2[1]
-  vertfetch_clamped  8, d2[0]
-  vertfetch_clamped  7, d1[3]
-  vertfetch_clamped  6, d1[2]
-  vertfetch_clamped  5, d1[1]
-  vertfetch_clamped  4, d1[0]
-  vertfetch_clamped  3, d0[3]
-  vertfetch_clamped  2, d0[2]
-  vertfetch_clamped  1, d0[1]
-  vertfetch_clamped  0, d0[0]
-  1:
-  \labelc : b 2f    /* done with clamped loop, skip over non-clamped loop */
-
-  /* Entry into this unrolled loop is computed as a negative index from
-   * \labelnc at the end of the block.
-   */
-  .align 4
-  vertfetch_noclamp 27, d6[3]
-  vertfetch_noclamp 26, d6[2]
-  vertfetch_noclamp 25, d6[1]
-  vertfetch_noclamp 24, d6[0]
-  vertfetch_noclamp 23, d5[3]
-  vertfetch_noclamp 22, d5[2]
-  vertfetch_noclamp 21, d5[1]
-  vertfetch_noclamp 20, d5[0]
-  vertfetch_noclamp 19, d4[3]
-  vertfetch_noclamp 18, d4[2]
-  vertfetch_noclamp 17, d4[1]
-  vertfetch_noclamp 16, d4[0]
-  vertfetch_noclamp 15, d3[3]
-  vertfetch_noclamp 14, d3[2]
-  vertfetch_noclamp 13, d3[1]
-  vertfetch_noclamp 12, d3[0]
-  vertfetch_noclamp 11, d2[3]
-  vertfetch_noclamp 10, d2[2]
-  vertfetch_noclamp  9, d2[1]
-  vertfetch_noclamp  8, d2[0]
-  vertfetch_noclamp  7, d1[3]
-  vertfetch_noclamp  6, d1[2]
-  vertfetch_noclamp  5, d1[1]
-  vertfetch_noclamp  4, d1[0]
-  vertfetch_noclamp  3, d0[3]
-  vertfetch_noclamp  2, d0[2]
-  vertfetch_noclamp  1, d0[1]
-  vertfetch_noclamp  0, d0[0]
-  \labelnc :
-
-  /* The helper macros are removed again so that the next expansion of fetch
-   * can redefine them with its own max_r.
-   */
-  .purgem vertfetch_clamped
-  .purgem vertfetch_noclamp
-
-  /* Narrow the four 32-bit accumulators to 16 bits with rounding and
-   * saturation, keeping FRACTION_BITS fractional bits; the 16 convolved
-   * columns end up in q10,q11.
-   */
-  2:        vqrshrn.u32 d20, q12, #16 - FRACTION_BITS
-            vqrshrn.u32 d21, q13, #16 - FRACTION_BITS
-            vqrshrn.u32 d22, q14, #16 - FRACTION_BITS
-            vqrshrn.u32 d23, q15, #16 - FRACTION_BITS
-.endm /*}}}*/
-
-/* Some portion of the convolution window (as much as will fit, and all of it
- * for the uchar1 cases) is kept in the register file to avoid unnecessary
- * memory accesses.  This forces the horizontal loops to be unrolled because
- * there's no indexed addressing into the register file.
- *
- * As in the fetch macro, the operations are ordered from outside to inside, so
- * that jumping into the middle of the block bypasses the unwanted window taps.
- *
- * There are several variants of the macro because of the fixed offsets of the
- * taps -- the wider the maximum radius the further the centre tap is from the
- * most recently fetched data.  This means that pre-filling the window requires
- * more data that won't be used and it means that rotating the window involves
- * more mov operations.
- *
- * When the buffer gets too big the buffer at [r9] is used.
- *
- * Input:
- *      q4-q11 -- convolution window
- *      r9 -- pointer to additional convolution window data
- * Output:
- *      r9 -- updated buffer pointer (if used)
- *      d31 -- result to be stored
- * Modifies:
- *      r12 -- temp buffer pointer
- *      q12-q13 -- temporaries for load and vext operations.
- *      q14-q15 -- intermediate sums
- */
-#define TUNED_LIST1 8, 16
-.macro hconv1_8/*{{{*/
-            /* Horizontal pass for single-channel data with radius up to 8.
-             * The centre of the window is q9 (d18,d19); q8 and q10 hold the
-             * eight elements either side and q11 is the newest data, shifted
-             * in at the end.  Start the sums with the centre tap.
-             */
-            vmull.u16   q14, d18, d0[0]
-            vmull.u16   q15, d19, d0[0]
-
-            /* Computed jump on the radius in r5.  In ARM state pc reads as
-             * the current instruction + 8, so the ldr indexes from the bkpt
-             * and r5 == 1 selects the first table word; the add then jumps
-             * relative to label 100.  The bkpt is only a filler that is
-             * jumped over.  Assumes 1 <= r5 <= 8 -- nothing here checks it.
-             */
-            ldr         r12, [pc, r5, LSL #2]
-            add         pc, pc, r12
-            bkpt
-    100:    .word 101f-100b
-            .word 102f-100b
-            .word 103f-100b
-            .word 104f-100b
-            .word 105f-100b
-            .word 106f-100b
-            .word 107f-100b
-            .word 108f-100b
-            /* Label 10k handles tap k and falls through to the smaller taps.
-             * Each tap adds the data k elements left and k elements right of
-             * the centre, both scaled by coefficient k.  Offsets that are a
-             * multiple of four elements line up with d registers and need no
-             * vext.
-             */
-    108:    vmlal.u16   q14, d16, d2[0]
-            vmlal.u16   q15, d17, d2[0]
-            vmlal.u16   q14, d20, d2[0]
-            vmlal.u16   q15, d21, d2[0]
-    107:    vext.u16    q12, q8, q9, #1
-            vext.u16    q13, q9, q10, #7
-            vmlal.u16   q14, d24, d1[3]
-            vmlal.u16   q15, d25, d1[3]
-            vmlal.u16   q14, d26, d1[3]
-            vmlal.u16   q15, d27, d1[3]
-    106:    vext.u16    q12, q8, q9, #2
-            vext.u16    q13, q9, q10, #6
-            vmlal.u16   q14, d24, d1[2]
-            vmlal.u16   q15, d25, d1[2]
-            vmlal.u16   q14, d26, d1[2]
-            vmlal.u16   q15, d27, d1[2]
-    105:    vext.u16    q12, q8, q9, #3
-            vext.u16    q13, q9, q10, #5
-            vmlal.u16   q14, d24, d1[1]
-            vmlal.u16   q15, d25, d1[1]
-            vmlal.u16   q14, d26, d1[1]
-            vmlal.u16   q15, d27, d1[1]
-    104:    //vext.u16    q12, q8, q9, #4
-            //vext.u16    q13, q9, q10, #4
-            vmlal.u16   q14, d17, d1[0]
-            vmlal.u16   q15, d18, d1[0]
-            vmlal.u16   q14, d19, d1[0]
-            vmlal.u16   q15, d20, d1[0]
-    103:    vext.u16    q12, q8, q9, #5
-            vext.u16    q13, q9, q10, #3
-            vmlal.u16   q14, d24, d0[3]
-            vmlal.u16   q15, d25, d0[3]
-            vmlal.u16   q14, d26, d0[3]
-            vmlal.u16   q15, d27, d0[3]
-    102:    vext.u16    q12, q8, q9, #6
-            vext.u16    q13, q9, q10, #2
-            vmlal.u16   q14, d24, d0[2]
-            vmlal.u16   q15, d25, d0[2]
-            vmlal.u16   q14, d26, d0[2]
-            vmlal.u16   q15, d27, d0[2]
-    101:    vext.u16    q12, q8, q9, #7
-            vext.u16    q13, q9, q10, #1
-            vmlal.u16   q14, d24, d0[1]
-            vmlal.u16   q15, d25, d0[1]
-            vmlal.u16   q14, d26, d0[1]
-            vmlal.u16   q15, d27, d0[1]
-
-            /* Narrow the 32-bit sums to 16 bits, then drop the FRACTION_BITS
-             * fractional bits to leave eight 8-bit results in d31.
-             */
-            vqrshrn.u32 d28, q14, #16
-            vqrshrn.u32 d29, q15, #16
-            vqrshrn.u16 d31, q14, #FRACTION_BITS
-
-            /* Slide the window along by one q register (eight elements). */
-            vmov        q8, q9
-            vmov        q9, q10
-            vmov        q10, q11
-.endm/*}}}*/
-
-.macro hconv1_16/*{{{*/
-            /* Horizontal pass for single-channel data with radius up to 16.
-             * The centre of the window is q8 (d16,d17); q6,q7 hold the sixteen
-             * elements to its left, q9,q10 the sixteen to its right, and q11
-             * is the newest data, shifted in at the end.  Start the sums with
-             * the centre tap.
-             */
-            vmull.u16   q14, d16, d0[0]
-            vmull.u16   q15, d17, d0[0]
-
-            /* Computed jump on the radius in r5.  In ARM state pc reads as
-             * the current instruction + 8, so the ldr indexes from the bkpt
-             * and r5 == 1 selects the first table word; the add then jumps
-             * relative to label 100.  The bkpt is only a filler that is
-             * jumped over.  Assumes 1 <= r5 <= 16 -- nothing here checks it.
-             */
-            ldr         r12, [pc, r5, LSL #2]
-            add         pc, pc, r12
-            bkpt
-    100:    .word 101f-100b
-            .word 102f-100b
-            .word 103f-100b
-            .word 104f-100b
-            .word 105f-100b
-            .word 106f-100b
-            .word 107f-100b
-            .word 108f-100b
-            .word 109f-100b
-            .word 110f-100b
-            .word 111f-100b
-            .word 112f-100b
-            .word 113f-100b
-            .word 114f-100b
-            .word 115f-100b
-            .word 116f-100b
-            /* Label 1kk handles tap kk and falls through to the smaller taps.
-             * Each tap adds the data kk elements left and kk elements right of
-             * the centre, both scaled by coefficient kk.  Offsets that are a
-             * multiple of four elements line up with d registers, which is why
-             * the corresponding vext lines are commented out.
-             */
-    116:    //vext.u16    q12, q6, q7, #0
-            //vext.u16    q13, q10, q11, #0
-            vmlal.u16   q14, d12, d4[0]
-            vmlal.u16   q15, d13, d4[0]
-            vmlal.u16   q14, d20, d4[0]
-            vmlal.u16   q15, d21, d4[0]
-    115:    vext.u16    q12, q6, q7, #1
-            vext.u16    q13, q9, q10, #7
-            vmlal.u16   q14, d24, d3[3]
-            vmlal.u16   q15, d25, d3[3]
-            vmlal.u16   q14, d26, d3[3]
-            vmlal.u16   q15, d27, d3[3]
-    114:    vext.u16    q12, q6, q7, #2
-            vext.u16    q13, q9, q10, #6
-            vmlal.u16   q14, d24, d3[2]
-            vmlal.u16   q15, d25, d3[2]
-            vmlal.u16   q14, d26, d3[2]
-            vmlal.u16   q15, d27, d3[2]
-    113:    vext.u16    q12, q6, q7, #3
-            vext.u16    q13, q9, q10, #5
-            vmlal.u16   q14, d24, d3[1]
-            vmlal.u16   q15, d25, d3[1]
-            vmlal.u16   q14, d26, d3[1]
-            vmlal.u16   q15, d27, d3[1]
-    112:    //vext.u16    q12, q6, q7, #4
-            //vext.u16    q13, q9, q10, #4
-            vmlal.u16   q14, d13, d3[0]
-            vmlal.u16   q15, d14, d3[0]
-            vmlal.u16   q14, d19, d3[0]
-            vmlal.u16   q15, d20, d3[0]
-    111:    vext.u16    q12, q6, q7, #5
-            vext.u16    q13, q9, q10, #3
-            vmlal.u16   q14, d24, d2[3]
-            vmlal.u16   q15, d25, d2[3]
-            vmlal.u16   q14, d26, d2[3]
-            vmlal.u16   q15, d27, d2[3]
-    110:    vext.u16    q12, q6, q7, #6
-            vext.u16    q13, q9, q10, #2
-            vmlal.u16   q14, d24, d2[2]
-            vmlal.u16   q15, d25, d2[2]
-            vmlal.u16   q14, d26, d2[2]
-            vmlal.u16   q15, d27, d2[2]
-    109:    vext.u16    q12, q6, q7, #7
-            vext.u16    q13, q9, q10, #1
-            vmlal.u16   q14, d24, d2[1]
-            vmlal.u16   q15, d25, d2[1]
-            vmlal.u16   q14, d26, d2[1]
-            vmlal.u16   q15, d27, d2[1]
-    108:    //vext.u16    q12, q7, q8, #0
-            //vext.u16    q13, q9, q10, #0
-            vmlal.u16   q14, d14, d2[0]
-            vmlal.u16   q15, d15, d2[0]
-            vmlal.u16   q14, d18, d2[0]
-            vmlal.u16   q15, d19, d2[0]
-    107:    vext.u16    q12, q7, q8, #1
-            vext.u16    q13, q8, q9, #7
-            vmlal.u16   q14, d24, d1[3]
-            vmlal.u16   q15, d25, d1[3]
-            vmlal.u16   q14, d26, d1[3]
-            vmlal.u16   q15, d27, d1[3]
-    106:    vext.u16    q12, q7, q8, #2
-            vext.u16    q13, q8, q9, #6
-            vmlal.u16   q14, d24, d1[2]
-            vmlal.u16   q15, d25, d1[2]
-            vmlal.u16   q14, d26, d1[2]
-            vmlal.u16   q15, d27, d1[2]
-    105:    vext.u16    q12, q7, q8, #3
-            vext.u16    q13, q8, q9, #5
-            vmlal.u16   q14, d24, d1[1]
-            vmlal.u16   q15, d25, d1[1]
-            vmlal.u16   q14, d26, d1[1]
-            vmlal.u16   q15, d27, d1[1]
-    104:    //vext.u16    q12, q7, q8, #4
-            //vext.u16    q13, q8, q9, #4
-            vmlal.u16   q14, d15, d1[0]
-            vmlal.u16   q15, d16, d1[0]
-            vmlal.u16   q14, d17, d1[0]
-            vmlal.u16   q15, d18, d1[0]
-    103:    vext.u16    q12, q7, q8, #5
-            vext.u16    q13, q8, q9, #3
-            vmlal.u16   q14, d24, d0[3]
-            vmlal.u16   q15, d25, d0[3]
-            vmlal.u16   q14, d26, d0[3]
-            vmlal.u16   q15, d27, d0[3]
-    102:    vext.u16    q12, q7, q8, #6
-            vext.u16    q13, q8, q9, #2
-            vmlal.u16   q14, d24, d0[2]
-            vmlal.u16   q15, d25, d0[2]
-            vmlal.u16   q14, d26, d0[2]
-            vmlal.u16   q15, d27, d0[2]
-    101:    vext.u16    q12, q7, q8, #7
-            vext.u16    q13, q8, q9, #1
-            vmlal.u16   q14, d24, d0[1]
-            vmlal.u16   q15, d25, d0[1]
-            vmlal.u16   q14, d26, d0[1]
-            vmlal.u16   q15, d27, d0[1]
-
-            /* Narrow the 32-bit sums to 16 bits, then drop the FRACTION_BITS
-             * fractional bits to leave eight 8-bit results in d31.
-             */
-            vqrshrn.u32 d28, q14, #16
-            vqrshrn.u32 d29, q15, #16
-            vqrshrn.u16 d31, q14, #FRACTION_BITS
-
-            /* Slide the window along by one q register (eight elements). */
-            vmov        q6, q7
-            vmov        q7, q8
-            vmov        q8, q9
-            vmov        q9, q10
-            vmov        q10, q11
-.endm/*}}}*/
-
-.macro hconv1_25/*{{{*/
-            /* Horizontal pass for single-channel data with radius up to 25.
-             * The window does not sit on a register boundary here: the centre
-             * eight elements start at the last element of q6 and run through
-             * the first seven of q7, so they are extracted with vext before
-             * the centre tap is applied.  The window spans d7 and q4-q10; d6
-             * (the low half of q3) is in use for coefficients 24 and 25, so
-             * only d7 carries window data from q3.
-             */
-            vext.u16    q12, q6, q7, #7
-            vmull.u16   q14, d24, d0[0]
-            vmull.u16   q15, d25, d0[0]
-
-            /* Computed jump on the radius in r5.  In ARM state pc reads as
-             * the current instruction + 8, so the ldr indexes from the bkpt
-             * and r5 == 1 selects the first table word; the add then jumps
-             * relative to label 100.  The bkpt is only a filler that is
-             * jumped over.  Assumes 1 <= r5 <= 25 -- nothing here checks it.
-             */
-            ldr         r12, [pc, r5, LSL #2]
-            add         pc, pc, r12
-            bkpt
-    100:    .word 101f-100b
-            .word 102f-100b
-            .word 103f-100b
-            .word 104f-100b
-            .word 105f-100b
-            .word 106f-100b
-            .word 107f-100b
-            .word 108f-100b
-            .word 109f-100b
-            .word 110f-100b
-            .word 111f-100b
-            .word 112f-100b
-            .word 113f-100b
-            .word 114f-100b
-            .word 115f-100b
-            .word 116f-100b
-            .word 117f-100b
-            .word 118f-100b
-            .word 119f-100b
-            .word 120f-100b
-            .word 121f-100b
-            .word 122f-100b
-            .word 123f-100b
-            .word 124f-100b
-            .word 125f-100b
-            /* Label 1kk handles tap kk and falls through to the smaller taps.
-             * q12 is the data kk elements left of the centre and q13 the data
-             * kk elements right of it; both are scaled by coefficient kk.
-             */
-    125:    vext.u16    q12, q3, q4, #6
-            vext.u16    q13, q10, q11, #0
-            vmlal.u16   q14, d24, d6[1]
-            vmlal.u16   q15, d25, d6[1]
-            vmlal.u16   q14, d26, d6[1]
-            vmlal.u16   q15, d27, d6[1]
-    124:    vext.u16    q12, q3, q4, #7
-            vext.u16    q13, q9, q10, #7
-            vmlal.u16   q14, d24, d6[0]
-            vmlal.u16   q15, d25, d6[0]
-            vmlal.u16   q14, d26, d6[0]
-            vmlal.u16   q15, d27, d6[0]
-    123:    vext.u16    q12, q4, q5, #0
-            vext.u16    q13, q9, q10, #6
-            vmlal.u16   q14, d24, d5[3]
-            vmlal.u16   q15, d25, d5[3]
-            vmlal.u16   q14, d26, d5[3]
-            vmlal.u16   q15, d27, d5[3]
-    122:    vext.u16    q12, q4, q5, #1
-            vext.u16    q13, q9, q10, #5
-            vmlal.u16   q14, d24, d5[2]
-            vmlal.u16   q15, d25, d5[2]
-            vmlal.u16   q14, d26, d5[2]
-            vmlal.u16   q15, d27, d5[2]
-    121:    vext.u16    q12, q4, q5, #2
-            vext.u16    q13, q9, q10, #4
-            vmlal.u16   q14, d24, d5[1]
-            vmlal.u16   q15, d25, d5[1]
-            vmlal.u16   q14, d26, d5[1]
-            vmlal.u16   q15, d27, d5[1]
-    120:    vext.u16    q12, q4, q5, #3
-            vext.u16    q13, q9, q10, #3
-            vmlal.u16   q14, d24, d5[0]
-            vmlal.u16   q15, d25, d5[0]
-            vmlal.u16   q14, d26, d5[0]
-            vmlal.u16   q15, d27, d5[0]
-    119:    vext.u16    q12, q4, q5, #4
-            vext.u16    q13, q9, q10, #2
-            vmlal.u16   q14, d24, d4[3]
-            vmlal.u16   q15, d25, d4[3]
-            vmlal.u16   q14, d26, d4[3]
-            vmlal.u16   q15, d27, d4[3]
-    118:    vext.u16    q12, q4, q5, #5
-            vext.u16    q13, q9, q10, #1
-            vmlal.u16   q14, d24, d4[2]
-            vmlal.u16   q15, d25, d4[2]
-            vmlal.u16   q14, d26, d4[2]
-            vmlal.u16   q15, d27, d4[2]
-    117:    vext.u16    q12, q4, q5, #6
-            vext.u16    q13, q9, q10, #0
-            vmlal.u16   q14, d24, d4[1]
-            vmlal.u16   q15, d25, d4[1]
-            vmlal.u16   q14, d26, d4[1]
-            vmlal.u16   q15, d27, d4[1]
-    116:    vext.u16    q12, q4, q5, #7
-            vext.u16    q13, q8, q9, #7
-            vmlal.u16   q14, d24, d4[0]
-            vmlal.u16   q15, d25, d4[0]
-            vmlal.u16   q14, d26, d4[0]
-            vmlal.u16   q15, d27, d4[0]
-    115:    vext.u16    q12, q5, q6, #0
-            vext.u16    q13, q8, q9, #6
-            vmlal.u16   q14, d24, d3[3]
-            vmlal.u16   q15, d25, d3[3]
-            vmlal.u16   q14, d26, d3[3]
-            vmlal.u16   q15, d27, d3[3]
-    114:    vext.u16    q12, q5, q6, #1
-            vext.u16    q13, q8, q9, #5
-            vmlal.u16   q14, d24, d3[2]
-            vmlal.u16   q15, d25, d3[2]
-            vmlal.u16   q14, d26, d3[2]
-            vmlal.u16   q15, d27, d3[2]
-    113:    vext.u16    q12, q5, q6, #2
-            vext.u16    q13, q8, q9, #4
-            vmlal.u16   q14, d24, d3[1]
-            vmlal.u16   q15, d25, d3[1]
-            vmlal.u16   q14, d26, d3[1]
-            vmlal.u16   q15, d27, d3[1]
-    112:    vext.u16    q12, q5, q6, #3
-            vext.u16    q13, q8, q9, #3
-            vmlal.u16   q14, d24, d3[0]
-            vmlal.u16   q15, d25, d3[0]
-            vmlal.u16   q14, d26, d3[0]
-            vmlal.u16   q15, d27, d3[0]
-    111:    vext.u16    q12, q5, q6, #4
-            vext.u16    q13, q8, q9, #2
-            vmlal.u16   q14, d24, d2[3]
-            vmlal.u16   q15, d25, d2[3]
-            vmlal.u16   q14, d26, d2[3]
-            vmlal.u16   q15, d27, d2[3]
-    110:    vext.u16    q12, q5, q6, #5
-            vext.u16    q13, q8, q9, #1
-            vmlal.u16   q14, d24, d2[2]
-            vmlal.u16   q15, d25, d2[2]
-            vmlal.u16   q14, d26, d2[2]
-            vmlal.u16   q15, d27, d2[2]
-    109:    vext.u16    q12, q5, q6, #6
-            vext.u16    q13, q8, q9, #0
-            vmlal.u16   q14, d24, d2[1]
-            vmlal.u16   q15, d25, d2[1]
-            vmlal.u16   q14, d26, d2[1]
-            vmlal.u16   q15, d27, d2[1]
-    108:    vext.u16    q12, q5, q6, #7
-            vext.u16    q13, q7, q8, #7
-            vmlal.u16   q14, d24, d2[0]
-            vmlal.u16   q15, d25, d2[0]
-            vmlal.u16   q14, d26, d2[0]
-            vmlal.u16   q15, d27, d2[0]
-    107:    vext.u16    q12, q6, q7, #0
-            vext.u16    q13, q7, q8, #6
-            vmlal.u16   q14, d24, d1[3]
-            vmlal.u16   q15, d25, d1[3]
-            vmlal.u16   q14, d26, d1[3]
-            vmlal.u16   q15, d27, d1[3]
-    106:    vext.u16    q12, q6, q7, #1
-            vext.u16    q13, q7, q8, #5
-            vmlal.u16   q14, d24, d1[2]
-            vmlal.u16   q15, d25, d1[2]
-            vmlal.u16   q14, d26, d1[2]
-            vmlal.u16   q15, d27, d1[2]
-    105:    vext.u16    q12, q6, q7, #2
-            vext.u16    q13, q7, q8, #4
-            vmlal.u16   q14, d24, d1[1]
-            vmlal.u16   q15, d25, d1[1]
-            vmlal.u16   q14, d26, d1[1]
-            vmlal.u16   q15, d27, d1[1]
-    104:    vext.u16    q12, q6, q7, #3
-            vext.u16    q13, q7, q8, #3
-            vmlal.u16   q14, d24, d1[0]
-            vmlal.u16   q15, d25, d1[0]
-            vmlal.u16   q14, d26, d1[0]
-            vmlal.u16   q15, d27, d1[0]
-    103:    vext.u16    q12, q6, q7, #4
-            vext.u16    q13, q7, q8, #2
-            vmlal.u16   q14, d24, d0[3]
-            vmlal.u16   q15, d25, d0[3]
-            vmlal.u16   q14, d26, d0[3]
-            vmlal.u16   q15, d27, d0[3]
-    102:    vext.u16    q12, q6, q7, #5
-            vext.u16    q13, q7, q8, #1
-            vmlal.u16   q14, d24, d0[2]
-            vmlal.u16   q15, d25, d0[2]
-            vmlal.u16   q14, d26, d0[2]
-            vmlal.u16   q15, d27, d0[2]
-    101:    vext.u16    q12, q6, q7, #6
-            vext.u16    q13, q7, q8, #0
-            vmlal.u16   q14, d24, d0[1]
-            vmlal.u16   q15, d25, d0[1]
-            vmlal.u16   q14, d26, d0[1]
-            vmlal.u16   q15, d27, d0[1]
-
-            /* Narrow the 32-bit sums to 16 bits, then drop the FRACTION_BITS
-             * fractional bits to leave eight 8-bit results in d31.
-             */
-            vqrshrn.u32 d28, q14, #16
-            vqrshrn.u32 d29, q15, #16
-            vqrshrn.u16 d31, q14, #FRACTION_BITS
-
-            /* Slide the window along by eight elements.  d7 must take the old
-             * high half of q4 (d9) before q4 itself is overwritten.
-             */
-            vmov        d7, d9
-            vmov        q4, q5
-            vmov        q5, q6
-            vmov        q6, q7
-            vmov        q7, q8
-            vmov        q8, q9
-            vmov        q9, q10
-            vmov        q10, q11
-.endm/*}}}*/
-
-#define TUNED_LIST4 6, 12
-.macro hconv4_6/*{{{*/
-            /* Horizontal pass for four-channel data with radius up to 6.
-             * Successive taps step by exactly one d register (four 16-bit
-             * elements -- presumably one four-channel pixel), so every tap
-             * reads whole d registers and no vext is needed.  The centre is
-             * q7 (d14,d15); the window spans q4-q10 and q11 is the newest
-             * data, shifted in at the end.
-             */
-            vmull.u16   q14, d14, d0[0]
-            vmull.u16   q15, d15, d0[0]
-
-            /* Computed jump on the radius in r5.  In ARM state pc reads as
-             * the current instruction + 8, so the ldr indexes from the bkpt
-             * and r5 == 1 selects the first table word; the add then jumps
-             * relative to label 100.  The bkpt is only a filler that is
-             * jumped over.  Assumes 1 <= r5 <= 6 -- nothing here checks it.
-             */
-            ldr         r12, [pc, r5, LSL #2]
-            add         pc, pc, r12
-            bkpt
-    100:    .word 101f-100b
-            .word 102f-100b
-            .word 103f-100b
-            .word 104f-100b
-            .word 105f-100b
-            .word 106f-100b
-            /* Label 10k handles tap k (k d registers left and right of the
-             * centre) and falls through to the smaller taps.
-             */
-    106:    vmlal.u16   q14, d8,  d1[2]
-            vmlal.u16   q15, d9,  d1[2]
-            vmlal.u16   q14, d20, d1[2]
-            vmlal.u16   q15, d21, d1[2]
-    105:    vmlal.u16   q14, d9,  d1[1]
-            vmlal.u16   q15, d10, d1[1]
-            vmlal.u16   q14, d19, d1[1]
-            vmlal.u16   q15, d20, d1[1]
-    104:    vmlal.u16   q14, d10, d1[0]
-            vmlal.u16   q15, d11, d1[0]
-            vmlal.u16   q14, d18, d1[0]
-            vmlal.u16   q15, d19, d1[0]
-    103:    vmlal.u16   q14, d11, d0[3]
-            vmlal.u16   q15, d12, d0[3]
-            vmlal.u16   q14, d17, d0[3]
-            vmlal.u16   q15, d18, d0[3]
-    102:    vmlal.u16   q14, d12, d0[2]
-            vmlal.u16   q15, d13, d0[2]
-            vmlal.u16   q14, d16, d0[2]
-            vmlal.u16   q15, d17, d0[2]
-    101:    vmlal.u16   q14, d13, d0[1]
-            vmlal.u16   q15, d14, d0[1]
-            vmlal.u16   q14, d15, d0[1]
-            vmlal.u16   q15, d16, d0[1]
-
-            /* Narrow the 32-bit sums to 16 bits, then drop the FRACTION_BITS
-             * fractional bits to leave eight 8-bit results in d31.
-             */
-            vqrshrn.u32 d28, q14, #16
-            vqrshrn.u32 d29, q15, #16
-            vqrshrn.u16 d31, q14, #FRACTION_BITS
-
-            /* Slide the window along by one q register. */
-            vmov        q4, q5
-            vmov        q5, q6
-            vmov        q6, q7
-            vmov        q7, q8
-            vmov        q8, q9
-            vmov        q9, q10
-            vmov        q10, q11
-.endm/*}}}*/
-
-.macro hconv4_12/*{{{*/
-            /* Horizontal pass for four-channel data with radius up to 12.
-             * The centre is q4 (d8,d9).  Data to the right of the centre is in
-             * registers (up to q10); data to the left no longer fits in the
-             * register file and is read back from the buffer at r9.
-             *
-             * Every buffer address is formed as r9 + offset followed by
-             * `bic #0x200`, which wraps the address.  This reads as circular
-             * addressing over a 0x200-byte buffer placed so that bit 9 of its
-             * addresses is clear -- TODO confirm against the code that sets up
-             * r9.
-             */
-            vmull.u16   q14, d8, d0[0]
-            vmull.u16   q15, d9, d0[0]
-
-            /* Computed jump on the radius in r5.  In ARM state pc reads as
-             * the current instruction + 8, so the ldr indexes from the bkpt
-             * and r5 == 1 selects the first table word; the add then jumps
-             * relative to label 100.  The bkpt is only a filler that is
-             * jumped over.  Assumes 1 <= r5 <= 12 -- nothing here checks it.
-             */
-            ldr         r12, [pc, r5, LSL #2]
-            add         pc, pc, r12
-            bkpt
-    100:    .word 101f-100b
-            .word 102f-100b
-            .word 103f-100b
-            .word 104f-100b
-            .word 105f-100b
-            .word 106f-100b
-            .word 107f-100b
-            .word 108f-100b
-            .word 109f-100b
-            .word 110f-100b
-            .word 111f-100b
-            .word 112f-100b
-            /* Label 1kk handles tap kk and falls through to the smaller taps.
-             * Even taps start on a 16-byte boundary and load both halves with
-             * one 128-bit access.  Odd taps start 8 bytes off, so the two
-             * halves are loaded separately with a second bic in between in
-             * case the pair straddles the wrap point.
-             */
-    112:    add         r12, r9, #0x1a0
-            bic         r12, r12, #0x200
-            vld1.u16    {d24,d25}, [r12:128]
-            vmlal.u16   q14, d24, d3[0]
-            vmlal.u16   q15, d25, d3[0]
-            vmlal.u16   q14, d20, d3[0]
-            vmlal.u16   q15, d21, d3[0]
-    111:    add         r12, r9, #0x1a8
-            bic         r12, r12, #0x200
-            vld1.u16    {d24}, [r12:64]!
-            bic         r12, r12, #0x200
-            vld1.u16    {d25}, [r12:64]
-            vmlal.u16   q14, d24, d2[3]
-            vmlal.u16   q15, d25, d2[3]
-            vmlal.u16   q14, d19, d2[3]
-            vmlal.u16   q15, d20, d2[3]
-    110:    add         r12, r9, #0x1b0
-            bic         r12, r12, #0x200
-            vld1.u16    {d24,d25}, [r12:128]
-            vmlal.u16   q14, d24, d2[2]
-            vmlal.u16   q15, d25, d2[2]
-            vmlal.u16   q14, d18, d2[2]
-            vmlal.u16   q15, d19, d2[2]
-    109:    add         r12, r9, #0x1b8
-            bic         r12, r12, #0x200
-            vld1.u16    {d24}, [r12:64]!
-            bic         r12, r12, #0x200
-            vld1.u16    {d25}, [r12:64]
-            vmlal.u16   q14, d24, d2[1]
-            vmlal.u16   q15, d25, d2[1]
-            vmlal.u16   q14, d17, d2[1]
-            vmlal.u16   q15, d18, d2[1]
-    108:    add         r12, r9, #0x1c0
-            bic         r12, r12, #0x200
-            vld1.u16    {d24,d25}, [r12:128]
-            vmlal.u16   q14, d24, d2[0]
-            vmlal.u16   q15, d25, d2[0]
-            vmlal.u16   q14, d16, d2[0]
-            vmlal.u16   q15, d17, d2[0]
-    107:    add         r12, r9, #0x1c8
-            bic         r12, r12, #0x200
-            vld1.u16    {d24}, [r12:64]!
-            bic         r12, r12, #0x200
-            vld1.u16    {d25}, [r12:64]
-            vmlal.u16   q14, d24, d1[3]
-            vmlal.u16   q15, d25, d1[3]
-            vmlal.u16   q14, d15, d1[3]
-            vmlal.u16   q15, d16, d1[3]
-    106:    add         r12, r9, #0x1d0
-            bic         r12, r12, #0x200
-            vld1.u16    {d24,d25}, [r12:128]
-            vmlal.u16   q14, d24, d1[2]
-            vmlal.u16   q15, d25, d1[2]
-            vmlal.u16   q14, d14, d1[2]
-            vmlal.u16   q15, d15, d1[2]
-    105:    add         r12, r9, #0x1d8
-            bic         r12, r12, #0x200
-            vld1.u16    {d24}, [r12:64]!
-            bic         r12, r12, #0x200
-            vld1.u16    {d25}, [r12:64]
-            vmlal.u16   q14, d24, d1[1]
-            vmlal.u16   q15, d25, d1[1]
-            vmlal.u16   q14, d13, d1[1]
-            vmlal.u16   q15, d14, d1[1]
-    104:    add         r12, r9, #0x1e0
-            bic         r12, r12, #0x200
-            vld1.u16    {d24,d25}, [r12:128]
-            vmlal.u16   q14, d24, d1[0]
-            vmlal.u16   q15, d25, d1[0]
-            vmlal.u16   q14, d12, d1[0]
-            vmlal.u16   q15, d13, d1[0]
-    103:    add         r12, r9, #0x1e8
-            bic         r12, r12, #0x200
-            vld1.u16    {d24}, [r12:64]!
-            bic         r12, r12, #0x200
-            vld1.u16    {d25}, [r12:64]
-            vmlal.u16   q14, d24, d0[3]
-            vmlal.u16   q15, d25, d0[3]
-            vmlal.u16   q14, d11, d0[3]
-            vmlal.u16   q15, d12, d0[3]
-    102:    add         r12, r9, #0x1f0
-            bic         r12, r12, #0x200
-            vld1.u16    {d24,d25}, [r12:128]
-            vmlal.u16   q14, d24, d0[2]
-            vmlal.u16   q15, d25, d0[2]
-            vmlal.u16   q14, d10, d0[2]
-            vmlal.u16   q15, d11, d0[2]
-            /* Tap 1: only the first left-hand half comes from the buffer; the
-             * second half is d8, which is still in the register file.
-             */
-    101:    add         r12, r9, #0x1f8
-            bic         r12, r12, #0x200
-            vld1.u16    {d24}, [r12:64]
-            vmlal.u16   q14, d24, d0[1]
-            vmlal.u16   q15, d8,  d0[1]
-            vmlal.u16   q14, d9,  d0[1]
-            vmlal.u16   q15, d10, d0[1]
-
-            /* Narrow the 32-bit sums to 16 bits, then drop the FRACTION_BITS
-             * fractional bits to leave eight 8-bit results in d31.
-             */
-            vqrshrn.u32 d28, q14, #16
-            vqrshrn.u32 d29, q15, #16
-            vqrshrn.u16 d31, q14, #FRACTION_BITS
-
-            /* Store the outgoing q4 to the buffer at r9, advance r9 by 16
-             * bytes with the same bit-9 wrap, then slide the register window
-             * along by one q register.
-             */
-            vst1.u8     {q4}, [r9:128]!
-            bic         r9, r9, #0x200
-            vmov        q4, q5
-            vmov        q5, q6
-            vmov        q6, q7
-            vmov        q7, q8
-            vmov        q8, q9
-            vmov        q9, q10
-            vmov        q10, q11
-.endm/*}}}*/
-
-/* hconv4_25: one horizontal-convolution step with up to 25 taps either side
- * of the centre, where consecutive taps are 8 bytes (one d register, four
- * u16 lanes) apart.
- * NOTE(review): the name suggests this is the four-channel variant for radii
- * up to 25 -- confirm against the code that expands it.
- *
- * Reads:
- *      r5        -- number of taps to apply; indexes the offset table at 100:
- *                   (must be 1..25, see the dispatch comment below)
- *      r9        -- base address for the window data held in memory
- *      d0..d6    -- u16 multipliers: d0[0] for the centre, then one lane per
- *                   tap up to d6[1] for tap 25
- *      d8..d21   -- window data taken directly from registers
- * Writes:
- *      d31       -- result, narrowed twice from the 32-bit sums
- *      q12..q15  -- scratch (loaded operands and accumulators)
- *      r12       -- scratch address
- *      r9, q4..q10 -- q4 is stored at r9, r9 advances by 16 with bit 9
- *                   cleared, and q5..q11 move down one register
- *
- * Every computed address has bit 9 cleared (bic #0x200) before it is used.
- * NOTE(review): presumably this wraps accesses within a 0x200-byte circular
- * buffer whose base has bit 9 clear -- confirm where r9 is set up.
- */
-.macro hconv4_25/*{{{*/
-            /* Centre tap: q14 and q15 accumulate two adjacent outputs, centred
-             * at offsets 0x198 and 0x1a0 from r9.
-             */
-            add         r12, r9, #0x198
-            bic         r12, r12, #0x200
-            vld1.u16    {d24}, [r12:64]!
-            bic         r12, r12, #0x200
-            vld1.u16    {d25}, [r12:64]
-            vmull.u16   q14, d24, d0[0]
-            vmull.u16   q15, d25, d0[0]
-
-            /* Computed jump.  This assumes ARM state, where pc reads as the
-             * current instruction + 8: the ldr then indexes from the bkpt
-             * word (so r5 == 1 selects the first table entry) and the add is
-             * relative to label 100, which is what the table offsets are
-             * measured from.  r5 == 0 would fetch the bkpt word itself and is
-             * not a valid index.
-             */
-            ldr         r12, [pc, r5, LSL #2]
-            add         pc, pc, r12
-            bkpt
-    100:    .word 101f-100b
-            .word 102f-100b
-            .word 103f-100b
-            .word 104f-100b
-            .word 105f-100b
-            .word 106f-100b
-            .word 107f-100b
-            .word 108f-100b
-            .word 109f-100b
-            .word 110f-100b
-            .word 111f-100b
-            .word 112f-100b
-            .word 113f-100b
-            .word 114f-100b
-            .word 115f-100b
-            .word 116f-100b
-            .word 117f-100b
-            .word 118f-100b
-            .word 119f-100b
-            .word 120f-100b
-            .word 121f-100b
-            .word 122f-100b
-            .word 123f-100b
-            .word 124f-100b
-            .word 125f-100b
-            /* Tap N (label 100+N) adds, to each accumulator, the data 8*N
-             * bytes below its centre (loaded through r12 into d24/d25) and
-             * the data 8*N bytes above it, both scaled by tap N's multiplier.
-             * Execution falls through from the entry label down to 101.
-             * For taps 25..13 the upper operands are already in d8..d21.
-             * Odd-numbered taps use a single 16-byte load; even-numbered taps
-             * sit at an 8-byte offset, so they load two d registers
-             * separately and re-apply the bic between them.
-             */
-    125:    add         r12, r9, #0x0d0
-            bic         r12, r12, #0x200
-            vld1.u16    {d24,d25}, [r12:128]
-            vmlal.u16   q14, d24, d6[1]
-            vmlal.u16   q15, d25, d6[1]
-            vmlal.u16   q14, d20, d6[1]
-            vmlal.u16   q15, d21, d6[1]
-    124:    add         r12, r9, #0x0d8
-            bic         r12, r12, #0x200
-            vld1.u16    {d24}, [r12:64]!
-            bic         r12, r12, #0x200
-            vld1.u16    {d25}, [r12]
-            vmlal.u16   q14, d24, d6[0]
-            vmlal.u16   q15, d25, d6[0]
-            vmlal.u16   q14, d19, d6[0]
-            vmlal.u16   q15, d20, d6[0]
-    123:    add         r12, r9, #0x0e0
-            bic         r12, r12, #0x200
-            vld1.u16    {d24,d25}, [r12:128]
-            vmlal.u16   q14, d24, d5[3]
-            vmlal.u16   q15, d25, d5[3]
-            vmlal.u16   q14, d18, d5[3]
-            vmlal.u16   q15, d19, d5[3]
-    122:    add         r12, r9, #0x0e8
-            bic         r12, r12, #0x200
-            vld1.u16    {d24}, [r12:64]!
-            bic         r12, r12, #0x200
-            vld1.u16    {d25}, [r12]
-            vmlal.u16   q14, d24, d5[2]
-            vmlal.u16   q15, d25, d5[2]
-            vmlal.u16   q14, d17, d5[2]
-            vmlal.u16   q15, d18, d5[2]
-    121:    add         r12, r9, #0x0f0
-            bic         r12, r12, #0x200
-            vld1.u16    {d24,d25}, [r12:128]
-            vmlal.u16   q14, d24, d5[1]
-            vmlal.u16   q15, d25, d5[1]
-            vmlal.u16   q14, d16, d5[1]
-            vmlal.u16   q15, d17, d5[1]
-    120:    add         r12, r9, #0x0f8
-            bic         r12, r12, #0x200
-            vld1.u16    {d24}, [r12:64]!
-            bic         r12, r12, #0x200
-            vld1.u16    {d25}, [r12]
-            vmlal.u16   q14, d24, d5[0]
-            vmlal.u16   q15, d25, d5[0]
-            vmlal.u16   q14, d15, d5[0]
-            vmlal.u16   q15, d16, d5[0]
-    119:    add         r12, r9, #0x100
-            bic         r12, r12, #0x200
-            vld1.u16    {d24,d25}, [r12:128]
-            vmlal.u16   q14, d24, d4[3]
-            vmlal.u16   q15, d25, d4[3]
-            vmlal.u16   q14, d14, d4[3]
-            vmlal.u16   q15, d15, d4[3]
-    118:    add         r12, r9, #0x108
-            bic         r12, r12, #0x200
-            vld1.u16    {d24}, [r12:64]!
-            bic         r12, r12, #0x200
-            vld1.u16    {d25}, [r12]
-            vmlal.u16   q14, d24, d4[2]
-            vmlal.u16   q15, d25, d4[2]
-            vmlal.u16   q14, d13, d4[2]
-            vmlal.u16   q15, d14, d4[2]
-    117:    add         r12, r9, #0x110
-            bic         r12, r12, #0x200
-            vld1.u16    {d24,d25}, [r12:128]
-            vmlal.u16   q14, d24, d4[1]
-            vmlal.u16   q15, d25, d4[1]
-            vmlal.u16   q14, d12, d4[1]
-            vmlal.u16   q15, d13, d4[1]
-    116:    add         r12, r9, #0x118
-            bic         r12, r12, #0x200
-            vld1.u16    {d24}, [r12:64]!
-            bic         r12, r12, #0x200
-            vld1.u16    {d25}, [r12]
-            vmlal.u16   q14, d24, d4[0]
-            vmlal.u16   q15, d25, d4[0]
-            vmlal.u16   q14, d11, d4[0]
-            vmlal.u16   q15, d12, d4[0]
-    115:    add         r12, r9, #0x120
-            bic         r12, r12, #0x200
-            vld1.u16    {d24,d25}, [r12:128]
-            vmlal.u16   q14, d24, d3[3]
-            vmlal.u16   q15, d25, d3[3]
-            vmlal.u16   q14, d10, d3[3]
-            vmlal.u16   q15, d11, d3[3]
-    114:    add         r12, r9, #0x128
-            bic         r12, r12, #0x200
-            vld1.u16    {d24}, [r12:64]!
-            bic         r12, r12, #0x200
-            vld1.u16    {d25}, [r12]
-            vmlal.u16   q14, d24, d3[2]
-            vmlal.u16   q15, d25, d3[2]
-            vmlal.u16   q14, d9,  d3[2]
-            vmlal.u16   q15, d10, d3[2]
-    113:    add         r12, r9, #0x130
-            bic         r12, r12, #0x200
-            vld1.u16    {d24,d25}, [r12:128]
-            vmlal.u16   q14, d24, d3[1]
-            vmlal.u16   q15, d25, d3[1]
-            vmlal.u16   q14, d8,  d3[1]
-            vmlal.u16   q15, d9,  d3[1]
-            /* From tap 12 down, some or all of the upper operands are no
-             * longer in d8..d21; the indented instructions load them from
-             * memory into d26/d27.
-             */
-    112:    add         r12, r9, #0x138
-            bic         r12, r12, #0x200
-            vld1.u16    {d24}, [r12:64]!
-            bic         r12, r12, #0x200
-            vld1.u16    {d25}, [r12]
-                                            add         r12, r9, #0x1f8
-                                            bic         r12, r12, #0x200
-                                            vld1.u16    {d26}, [r12:64]
-            vmlal.u16   q14, d24, d3[0]
-            vmlal.u16   q15, d25, d3[0]
-            vmlal.u16   q14, d26, d3[0]   @ Could be d7, without the load, right?
-            vmlal.u16   q15, d8,  d3[0]
-    111:    add         r12, r9, #0x140
-            bic         r12, r12, #0x200
-            vld1.u16    {d24,d25}, [r12:128]
-                                            add         r12, r9, #0x1f0
-                                            bic         r12, r12, #0x200
-                                            vld1.u16    {d26,d27}, [r12:128]
-            vmlal.u16   q14, d24, d2[3]
-            vmlal.u16   q15, d25, d2[3]
-            vmlal.u16   q14, d26, d2[3]
-            vmlal.u16   q15, d27, d2[3]
-    110:    add         r12, r9, #0x148
-            bic         r12, r12, #0x200
-            vld1.u16    {d24}, [r12:64]!
-            bic         r12, r12, #0x200
-            vld1.u16    {d25}, [r12]
-                                            add         r12, r9, #0x1e8
-                                            bic         r12, r12, #0x200
-                                            vld1.u16    {d26}, [r12:64]!
-                                            bic         r12, r12, #0x200
-                                            vld1.u16    {d27}, [r12:64]
-            vmlal.u16   q14, d24, d2[2]
-            vmlal.u16   q15, d25, d2[2]
-            vmlal.u16   q14, d26, d2[2]
-            vmlal.u16   q15, d27, d2[2]
-    109:    add         r12, r9, #0x150
-            bic         r12, r12, #0x200
-            vld1.u16    {d24,d25}, [r12:128]
-                                            add         r12, r9, #0x1e0
-                                            bic         r12, r12, #0x200
-                                            vld1.u16    {d26,d27}, [r12:128]
-            vmlal.u16   q14, d24, d2[1]
-            vmlal.u16   q15, d25, d2[1]
-            vmlal.u16   q14, d26, d2[1]
-            vmlal.u16   q15, d27, d2[1]
-    108:    add         r12, r9, #0x158
-            bic         r12, r12, #0x200
-            vld1.u16    {d24}, [r12:64]!
-            bic         r12, r12, #0x200
-            vld1.u16    {d25}, [r12]
-                                            add         r12, r9, #0x1d8
-                                            bic         r12, r12, #0x200
-                                            vld1.u16    {d26}, [r12:64]!
-                                            bic         r12, r12, #0x200
-                                            vld1.u16    {d27}, [r12:64]
-            vmlal.u16   q14, d24, d2[0]
-            vmlal.u16   q15, d25, d2[0]
-            vmlal.u16   q14, d26, d2[0]
-            vmlal.u16   q15, d27, d2[0]
-    107:    add         r12, r9, #0x160
-            bic         r12, r12, #0x200
-            vld1.u16    {d24,d25}, [r12:128]
-                                            add         r12, r9, #0x1d0
-                                            bic         r12, r12, #0x200
-                                            vld1.u16    {d26,d27}, [r12:128]
-            vmlal.u16   q14, d24, d1[3]
-            vmlal.u16   q15, d25, d1[3]
-            vmlal.u16   q14, d26, d1[3]
-            vmlal.u16   q15, d27, d1[3]
-    106:    add         r12, r9, #0x168
-            bic         r12, r12, #0x200
-            vld1.u16    {d24}, [r12:64]!
-            bic         r12, r12, #0x200
-            vld1.u16    {d25}, [r12]
-                                            add         r12, r9, #0x1c8
-                                            bic         r12, r12, #0x200
-                                            vld1.u16    {d26}, [r12:64]!
-                                            bic         r12, r12, #0x200
-                                            vld1.u16    {d27}, [r12:64]
-            vmlal.u16   q14, d24, d1[2]
-            vmlal.u16   q15, d25, d1[2]
-            vmlal.u16   q14, d26, d1[2]
-            vmlal.u16   q15, d27, d1[2]
-    105:    add         r12, r9, #0x170
-            bic         r12, r12, #0x200
-            vld1.u16    {d24,d25}, [r12:128]
-                                            add         r12, r9, #0x1c0
-                                            bic         r12, r12, #0x200
-                                            vld1.u16    {d26,d27}, [r12:128]
-            vmlal.u16   q14, d24, d1[1]
-            vmlal.u16   q15, d25, d1[1]
-            vmlal.u16   q14, d26, d1[1]
-            vmlal.u16   q15, d27, d1[1]
-    104:    add         r12, r9, #0x178
-            bic         r12, r12, #0x200
-            vld1.u16    {d24}, [r12:64]!
-            bic         r12, r12, #0x200
-            vld1.u16    {d25}, [r12]
-                                            add         r12, r9, #0x1b8
-                                            bic         r12, r12, #0x200
-                                            vld1.u16    {d26}, [r12:64]!
-                                            bic         r12, r12, #0x200
-                                            vld1.u16    {d27}, [r12:64]
-            vmlal.u16   q14, d24, d1[0]
-            vmlal.u16   q15, d25, d1[0]
-            vmlal.u16   q14, d26, d1[0]
-            vmlal.u16   q15, d27, d1[0]
-    103:    add         r12, r9, #0x180
-            bic         r12, r12, #0x200
-            vld1.u16    {d24,d25}, [r12:128]
-                                            add         r12, r9, #0x1b0
-                                            bic         r12, r12, #0x200
-                                            vld1.u16    {d26,d27}, [r12:128]
-            vmlal.u16   q14, d24, d0[3]
-            vmlal.u16   q15, d25, d0[3]
-            vmlal.u16   q14, d26, d0[3]
-            vmlal.u16   q15, d27, d0[3]
-    102:    add         r12, r9, #0x188
-            bic         r12, r12, #0x200
-            vld1.u16    {d24}, [r12:64]!
-            bic         r12, r12, #0x200
-            vld1.u16    {d25}, [r12]
-                                            add         r12, r9, #0x1a8
-                                            bic         r12, r12, #0x200
-                                            vld1.u16    {d26}, [r12:64]!
-                                            bic         r12, r12, #0x200
-                                            vld1.u16    {d27}, [r12:64]
-            vmlal.u16   q14, d24, d0[2]
-            vmlal.u16   q15, d25, d0[2]
-            vmlal.u16   q14, d26, d0[2]
-            vmlal.u16   q15, d27, d0[2]
-            /* Tap 1: the four operands are contiguous (0x190..0x1af), so two
-             * 16-byte loads cover both sides of both centres.
-             */
-    101:    add         r12, r9, #0x190
-            bic         r12, r12, #0x200
-            vld1.u16    {d24,d25}, [r12:128]!
-            bic         r12, r12, #0x200
-            vld1.u16    {d26,d27}, [r12:128]
-            vmlal.u16   q14, d24, d0[1]
-            vmlal.u16   q15, d25, d0[1]
-            vmlal.u16   q14, d26, d0[1]
-            vmlal.u16   q15, d27, d0[1]
-
-            /* Narrow the two 32-bit sums to 16 bits (into q14), then narrow
-             * q14 to 8 bits in d31, rounding and saturating at each step.
-             */
-            vqrshrn.u32 d28, q14, #16
-            vqrshrn.u32 d29, q15, #16
-            vqrshrn.u16 d31, q14, #FRACTION_BITS
-
-            /* Store q4 to memory at r9, advance r9 (clearing bit 9), and move
-             * q5..q11 down into q4..q10.
-             */
-            vst1.u8     {q4}, [r9:128]!
-            bic         r9, r9, #0x200
-            vmov        q4, q5
-            vmov        q5, q6
-            vmov        q6, q7
-            vmov        q7, q8
-            vmov        q8, q9
-            vmov        q9, q10
-            vmov        q10, q11
-.endm/*}}}*/
-
-/* Dedicated function wrapper for the fetch macro, for the cases where
- * performance isn't that important, to keep code size down.
- *
- * r10 and r11 are saved and restored around the macro expansion, so callers
- * may keep values in them across the call.  The fetch macro is defined
- * outside this section; the callers below read its result from q10 and q11.
- */
-PRIVATE(fetch_generic_asm)
-            push        {r10,r11}
-            fetch
-            pop         {r10,r11}
-            bx          lr
-END(fetch_generic_asm)
-
-
-/* Fetch the next (16 - (r10 & 15)) columns of data, avoiding reading memory
- * beyond that limit, and filling the rest of the vector with the first legal
- * pixel.
- * Result is in q10 and q11.  q8 and q9 are filled with the first legal pixel.
- * Note: This function can read beyond the right edge of input if the image is
- * narrower than 16 bytes.
- *
- * When (r10 & 15) is non-zero, r1 and r10 are both reduced by that amount.
- * r12 and lr are preserved.
- */
-PRIVATE(fetch_clampleft1)
-            push        {r12,lr}
-            bl          fetch_generic_asm
-            /* Replicate the first fetched lane as the left padding value. */
-            vdup.u16    q8, d20[0]
-            vdup.u16    q9, d20[0]
-            ands        r12, r10, #15
-            beq         1f
-            sub         r1, r1, r12
-            sub         r10, r10, r12
-            /* Lay out [q8,q9][q10,q11] contiguously on the stack and reload
-             * q10,q11 from (r10 & 15) lanes before the fetched data, so the
-             * result starts with that many padding lanes.
-             */
-            sub         sp, sp, #32
-            vst1.u16    {q10,q11}, [sp]
-            sub         r12, sp, r12, LSL #1
-            sub         sp, sp, #32
-            vst1.u16    {q8,q9}, [sp]
-            vld1.u16    {q10,q11}, [r12]
-            add         sp, sp, #64
-1:          pop         {r12,pc}
-END(fetch_clampleft1)
-
-/* Variant of the left-clamped fetch in which the padding unit is the first
- * four fetched u16 lanes (d20) rather than a single lane.
- * Result is in q10 and q11.  d16..d19 (q8 and q9) each hold a copy of d20.
- * When (r10 & 15) is non-zero, r1 and r10 are both reduced by that amount and
- * q10,q11 are shifted so that they begin with that many lanes of padding.
- * NOTE(review): presumably (r10 & 15) is a multiple of 4 here so the padding
- * stays aligned to whole four-lane groups -- confirm against the caller.
- * r12 and lr are preserved.
- */
-PRIVATE(fetch_clampleft4)
-            push        {r12,lr}
-            bl          fetch_generic_asm
-            vmov.u16    d16, d20
-            vmov.u16    d17, d20
-            vmov.u16    d18, d20
-            vmov.u16    d19, d20
-            ands        r12, r10, #15
-            beq         1f
-            sub         r1, r1, r12
-            sub         r10, r10, r12
-            /* Lay out [q8,q9][q10,q11] contiguously on the stack and reload
-             * q10,q11 from (r10 & 15) lanes before the fetched data.
-             */
-            sub         sp, sp, #32
-            vst1.u16    {q10-q11}, [sp]
-            sub         r12, sp, r12, LSL #1
-            sub         sp, sp, #32
-            vst1.u16    {q8,q9}, [sp]
-            vld1.u16    {q10,q11}, [r12]
-            add         sp, sp, #64
-1:          pop         {r12,pc}
-END(fetch_clampleft4)
-
-/* Fetch only the next (r11 & 15) (where 0 means 16) columns of data, avoiding
- * reading memory beyond that limit, and filling the rest of the vector with
- * the last legal pixel.
- * Result is in q10 and q11.  q12 and q13 are filled with the last legal pixel.
- * Note: This function can read beyond the left edge of input if the image is
- * narrower than 16 bytes.
- *
- * When (-r11 & 15) is non-zero, r1 is reduced by that amount before the
- * fetch.  r12 and lr are preserved.
- */
-PRIVATE(fetch_clampright1)
-            push        {r12, lr}
-            /* r12 = number of lanes that would fall beyond the limit. */
-            rsb         r12, r11, #0
-            ands        r12, r12, #15
-            beq         1f
-            sub         r1, r1, r12
-            bl          fetch_generic_asm
-            /* Replicate the last fetched lane as the right padding value. */
-            vdup.u16    q12, d23[3]
-            vdup.u16    q13, d23[3]
-            /* r12 is recomputed rather than kept across the call.
-             * NOTE(review): presumably the fetch does not preserve r12 --
-             * confirm.
-             */
-            rsb         r12, r11, #0
-            and         r12, r12, #15
-            /* Lay out [q10,q11][q12,q13] contiguously on the stack and reload
-             * q10,q11 starting r12 lanes into the fetched data, so the result
-             * ends with that many padding lanes.
-             */
-            sub         sp, sp, #32
-            vst1.u16    {q12,q13}, [sp]
-            sub         sp, sp, #32
-            add         r12, sp, r12, LSL #1
-            vst1.u16    {q10,q11}, [sp]
-            vld1.u16    {q10,q11}, [r12]
-            add         sp, sp, #64
-            pop         {r12,pc}
-            /* Whole-chunk case: no shifting needed, just prepare padding. */
-1:          bl          fetch_generic_asm
-            vdup.u16    q12, d23[3]
-            vdup.u16    q13, d23[3]
-            pop         {r12,pc}
-END(fetch_clampright1)
-
-/* Variant of the right-clamped fetch in which the padding unit is the last
- * four fetched u16 lanes (d23) rather than a single lane.
- * Result is in q10 and q11.  d24..d27 (q12 and q13) each hold a copy of d23.
- * When (-r11 & 15) is non-zero, r1 is reduced by that amount before the fetch
- * and q10,q11 are shifted so that they end with that many lanes of padding.
- * NOTE(review): presumably (r11 & 15) is a multiple of 4 here so the padding
- * stays aligned to whole four-lane groups -- confirm against the caller.
- * r12 and lr are preserved.
- */
-PRIVATE(fetch_clampright4)
-            push        {r12, lr}
-            /* r12 = number of lanes that would fall beyond the limit. */
-            rsb         r12, r11, #0
-            ands        r12, r12, #15
-            beq         1f
-            sub         r1, r1, r12
-            bl          fetch_generic_asm
-            vmov.u16    d24, d23
-            vmov.u16    d25, d23
-            vmov.u16    d26, d23
-            vmov.u16    d27, d23
-            /* r12 is recomputed rather than kept across the call. */
-            rsb         r12, r11, #0
-            and         r12, r12, #15
-            /* Lay out [q10,q11][q12,q13] contiguously on the stack and reload
-             * q10,q11 starting r12 lanes into the fetched data.
-             */
-            sub         sp, sp, #32
-            vst1.u16    {q12-q13}, [sp]
-            sub         sp, sp, #32
-            add         r12, sp, r12, LSL #1
-            vst1.u16    {q10,q11}, [sp]
-            vld1.u16    {q10,q11}, [r12]
-            add         sp, sp, #64
-            pop         {r12,pc}
-            /* Whole-chunk case: no shifting needed, just prepare padding. */
-1:          bl          fetch_generic_asm
-            vmov.u16    d24, d23
-            vmov.u16    d25, d23
-            vmov.u16    d26, d23
-            vmov.u16    d27, d23
-            pop         {r12,pc}
-END(fetch_clampright4)
-
-/* Given values in q10 and q11, and an index in r11, sweep the (r11 & 15)th
- * value across to fill the rest of the register pair.  Used for filling the
- * right hand edge of the window when reading too close to the right hand edge
- * of the image.
- * Also returns a dup-ed copy of the last element in q12 for the tail-fill
- * case (this happens incidentally in common path, but must be done
- * deliberately in the fast-out path).
- *
- * q13 receives the same replicated value as q12.  r12 is used as scratch;
- * 64 bytes of stack are used temporarily in the common path.
- */
-PRIVATE(prefill_sweepright1)
-            ands        r12, r11, #15
-            beq         1f
-            /* Zero-based lane index of the last valid value. */
-            sub         r12, r12, #1
-            sub         sp, sp, #64
-            vst1.u16    {q10,q11}, [sp]
-            add         r12, sp, r12, LSL #1
-            /* Load that lane replicated into q12 and q13, then write the 16
-             * replicated lanes back starting at that lane.  The index is at
-             * most 14, so the 32-byte store stays within the 64-byte area.
-             */
-            vld1.u16    {d24[],d25[]}, [r12]
-            vld1.u16    {d26[],d27[]}, [r12]
-            vst1.u16    {q12,q13}, [r12]
-            vld1.u16    {q10,q11}, [sp]
-            add         sp, sp, #64
-            bx          lr
-            /* Fast-out path: all 16 lanes are valid; only build the padding. */
-1:          vdup.u16    q12, d23[3]
-            vdup.u16    q13, d23[3]
-            bx          lr
-END(prefill_sweepright1)
-
-/* Four-lane variant of the right-hand sweep: given values in q10 and q11 and
- * an index in r11, copy the four-lane group ending at lane (r11 & 15) across
- * the rest of the register pair.  d24..d27 (q12 and q13) each receive a copy
- * of that group; when (r11 & 15) is zero they receive d23 and q10,q11 are
- * left unchanged.
- * NOTE(review): assumes a non-zero (r11 & 15) is at least 4 (presumably a
- * multiple of 4); smaller values would address below the stack area --
- * confirm against the caller.
- * r12 is used as scratch; 64 bytes of stack are used temporarily.
- */
-PRIVATE(prefill_sweepright4)
-            ands        r12, r11, #15
-            beq         1f
-            /* Lane index of the start of the last valid four-lane group. */
-            sub         r12, r12, #4
-            sub         sp, sp, #64
-            vst1.u16    {q10,q11}, [sp]
-            add         r12, sp, r12, LSL #1
-            /* Load that group into all four d registers, then write the
-             * replicated 32 bytes back starting at that group.
-             */
-            vld1.u64    {d24}, [r12]
-            vld1.u64    {d25}, [r12]
-            vld1.u64    {d26}, [r12]
-            vld1.u64    {d27}, [r12]
-            vst1.u16    {q12,q13}, [r12]
-            vld1.u16    {q10,q11}, [sp]
-            add         sp, sp, #64
-            bx          lr
-            /* Fast-out path: all 16 lanes are valid; only build the padding. */
-1:          vmov.u16    d24, d23
-            vmov.u16    d25, d23
-            vmov.u16    d26, d23
-            vmov.u16    d27, d23
-            bx          lr
-END(prefill_sweepright4)
-
-/* The main loop keeps a sliding window of data that has already been convolved
- * in the vertical axis for the current line.  This usually stays in the
- * register file, but spills to memory for large windows.  The first thing that
- * needs to be done at start-up is to fill this window with image data, taking
- * into account the padding needed if the left or right edges of the image fall
- * within this window.
- */
-
-/* Because the window is in the register file writes to it cannot be indexed
- * by another register.  Consequently the fill loops are unrolled to address
- * the registers directly.  This macro distinguishes between writes to the
- * register file and writes to the spill buffer (indicated by a destination
- * register named xx).
- *
- * Arguments:
- *      ra, rb   -- destination registers, or xx
- *      sra, srb -- source registers holding the 16 lanes to write
- *      srb_hi   -- d register copied to \rb when only \ra is xx
- *
- * Cases:
- *      ra=xx, rb=xx -- store \sra,\srb to the buffer at r9, advancing r9
- *      ra=xx only   -- copy \srb_hi to \rb; \sra is dropped
- *      otherwise    -- copy \sra to \ra and \srb to \rb, omitting any copy
- *                      whose source and destination names are identical
- */
-.macro prefill_out ra, rb, sra, srb, srb_hi
-  .ifc \ra,xx
-    .ifc \rb,xx
-            vst1.u16    {\sra,\srb}, [r9:128]!
-    .else
-            /* this case is used only for the last tap of uchar1 r=25 */
-            /* discard \sra */
-            vmov.u16    \rb, \srb_hi
-    .endif
-  .else
-    .ifnc \ra,\sra
-            vmov.u16    \ra, \sra
-    .endif
-    .ifnc \rb,\srb
-            vmov.u16    \rb, \srb
-    .endif
-  .endif
-.endm
-
-/* This macro provides the list of registers representing the window, and the
- * cases where the register file is too small and a spill buffer is used
- * instead.
- * Since several specialisations of each function are generated, this also
- * culls superfluous iterations, and sets the variable `i` for subsequent
- * macros indicating the current index into the window.
- *
- * Arguments:
- *      macro     -- stage to expand; prefill_\macro is invoked per chunk
- *      nextmacro -- stage whose labels are passed as the \next and \after
- *                   branch targets
- *      max_r     -- accepted for symmetry with the caller; not referenced in
- *                   this macro (windowsize is read instead)
- *      step      -- passed through to the stage; also selects which chunk
- *                   list is used (step > 1 uses the spill buffer for chunks
- *                   13..4, otherwise chunk 4 uses d7)
- *      label     -- prefix that keeps the generated labels unique
- *
- * Each emitted chunk is labelled \label\macro\line, where \line counts down
- * to 1; a final \label\macro\()0 label branches to \label\()_end.
- */
-.macro prefill_list, macro, nextmacro, max_r, step, label
-  /* Emit one chunk if the window is large enough to contain it.  \next is
-   * the following stage at the same chunk, \after the following stage at the
-   * next chunk.
-   */
-  .macro ifneeded macro, nextmacro, line, nextline, ra, rb, step, label
-    .if windowsize >= (\line * 16)
-      .set i, windowsize - (\line * 16)
-\label\macro\line:
-            prefill_\macro \label\nextmacro\line, \label\nextmacro\nextline, \ra, \rb, \step
-    .endif
-  .endm
-  .if \step > 1
-            ifneeded \macro \nextmacro, 13, 12, xx, xx,  \step, \label
-            ifneeded \macro \nextmacro, 12, 11, xx, xx,  \step, \label
-            ifneeded \macro \nextmacro, 11, 10, xx, xx,  \step, \label
-            ifneeded \macro \nextmacro, 10,  9, xx, xx,  \step, \label
-            ifneeded \macro \nextmacro,  9,  8, xx, xx,  \step, \label
-            ifneeded \macro \nextmacro,  8,  7, xx, xx,  \step, \label
-            ifneeded \macro \nextmacro,  7,  6, xx, xx,  \step, \label
-            ifneeded \macro \nextmacro,  6,  5, xx, xx,  \step, \label
-            ifneeded \macro \nextmacro,  5,  4, xx, xx,  \step, \label
-            ifneeded \macro \nextmacro,  4,  3, xx, xx,  \step, \label
-  .else
-            /* q3 normally contains the coefficient table, but it's not fully
-             * used.  In the uchar1, r=25 case the other half of q3 is used for
-             * the last two window taps to avoid falling out to memory.
-             */
-            ifneeded \macro \nextmacro,  4,  3, xx, d7,   \step, \label
-  .endif
-            ifneeded \macro \nextmacro,  3,  2, q4, q5,   \step, \label
-            ifneeded \macro \nextmacro,  2,  1, q6, q7,   \step, \label
-            ifneeded \macro \nextmacro,  1,  0, q8, q9,   \step, \label
-
-\label\macro\()0:
-            b           \label\()_end
-  /* Remove the helper so the next expansion of prefill_list can redefine it. */
-  .purgem ifneeded
-.endm
-
-/* These macros represent the possible stages of filling the window.
- * Each macro is unrolled enough times that it can fill the entire window
- * itself, but normally it will have to hand control to subsequent macros
- * part-way through and this is done using labels named \next and \after, where
- * \next is the next macro starting at the same window position and \after is
- * the next macro starting after the current window position.
- */
-
-/* leftfill: q8 and q9 contain the left padding value.  While the window
- * extends outside of the image on the left-hand side, and at least 16 more
- * padding values are needed in the window, store q8 and q9 into the window.
- * Otherwise skip forward to storing image data.
- *
- * r10 is compared against the end of the current chunk (i+16); when r10 is
- * lower, control transfers to \next.  \after and \step are not used here.
- */
-.macro prefill_leftfill, next, after, ra, rb, step
-            cmp         r10, #i+16
-            blo         \next
-            prefill_out \ra, \rb, q8, q9, d19
-.endm
-
-/* leftedge: The very first non-fill or partial-fill chunk from the image is
- * already loaded (as it was used to calculate the left padding value), so
- * store it here, and then drop into the regular load/store cycle in the next
- * macro.
- *
- * Writes q10 and q11 to the current chunk and always branches to \after.
- * \next and \step are not used here.
- */
-.macro prefill_leftedge, next, after, ra, rb, step
-1:          prefill_out \ra, \rb, q10, q11, d23
-            b           \after
-.endm
-
-/* dofetch: Copy chunks of the image into the window without any complications
- * from edge conditions.
- *
- * Continues only while r11 is beyond the end of the current chunk (i+16);
- * otherwise control transfers to \next.  Each chunk is fetched with
- * fetch_generic_asm and written from q10 and q11.  \after and \step are not
- * used here.
- */
-.macro prefill_dofetch, next, after, ra, rb, step
-            cmp         r11, #i+16
-            bls         \next
-            bl          fetch_generic_asm
-            prefill_out \ra, \rb, q10, q11, d23
-.endm
-
-/* rightedge: The last fetch (currently in q10 and q11) may have gone beyond
- * the right-hand edge of the image.  In that case sweep the last valid pixel
- * across the rest of the chunk, and in either case prepare padding data in q12
- * and q13 for the next macro.  This is done in fetch_clampright.
- * This only happens once before going on to the next macro.
- * Sometimes leftedge also covers the rightedge case, in which case this has
- * to be skipped altogether.
- *
- * The skip is taken (branch to \next) when r11 is not beyond the start of the
- * current chunk (i).  Otherwise the chunk is fetched with
- * fetch_clampright\step, written out, and control transfers to \after.
- */
-.macro prefill_rightedge, next, after, ra, rb, step
-            cmp         r11, #i
-            bls         \next
-            bl          fetch_clampright\step
-            prefill_out \ra, \rb, q10, q11, d23
-            b           \after
-.endm
-
-/* rightfill: The rest of the window is simply filled with right padding from
- * q12 and q13.
- *
- * \next, \after and \step are accepted for uniformity with the other
- * prefill_* stages but are not used here.
- * The final argument to prefill_out is the high half of its \srb argument
- * (d27 for q13), matching the q9/d19 and q11/d23 pairs passed by the other
- * stages.  q12 and q13 hold the same replicated padding, so d27 carries the
- * same value as d25.
- */
-.macro prefill_rightfill, next, after, ra, rb, step
-            prefill_out \ra, \rb, q12, q13, d27
-.endm
-
-/* Here all of the macros above are unrolled and laid out in the proper order.
- *
- * Each prefill_list expansion names the stage to emit and the stage its
- * \next/\after branches lead to.  The final stage is given `oops` as its
- * successor; prefill_rightfill never uses \next or \after, so no label with
- * that name is ever referenced.  Every list ends with a branch to
- * \label\()_end, which is defined here.
- */
-.macro prefill_body, max_r, step, label
-            prefill_list leftfill,  leftedge,   \max_r, \step, \label
-            prefill_list leftedge,  dofetch,    \max_r, \step, \label
-            prefill_list dofetch,   rightedge,  \max_r, \step, \label
-            prefill_list rightedge, rightfill,  \max_r, \step, \label
-            prefill_list rightfill, oops,       \max_r, \step, \label
-\label\()_end:
-.endm
-
-/* Fill the convolution window with context data.  The aim here is to load
- * exactly 2*r columns, and in the main loop to read as many columns as will be
- * written.  This is complicated by the window being divided into chunks at
- * register boundaries, and the need to handle cases when the input starts very
- * close to the left or right (or both) edges of the image and the need to fill
- * the spaces that leaves with left and right edge padding values.
- *
- * Macro arguments:
- *      step  -- selects the fetch_clampleft/fetch_clampright/
- *               prefill_sweepright variant (1 or 4) and scales the window
- *      max_r -- largest radius this expansion must accommodate
- *      label -- prefix for the labels generated by prefill_body
- *
- * Input:
- *      r1 -- src
- *      r2 -- pitch
- *      r3 -- count
- *      r4 -- available image data right of src pointer
- *      r5 -- r
- *      r6 -- rup
- *      r7 -- rdn
- *      r8 -- available image data left of src pointer
- *      r9 -- buffer (if needed)
- * Output:
- *      r4 -= min(inlen, count + windowsize - centertap)
- *      r1 += min(inlen, count + windowsize - centertap)
- * Modifies:
- *      r10 -- fill start index in the window
- *      r11 -- fill stop index in the window
- *      r12 -- scratch
- */
-.macro prefill step=1, max_r=25, label=xx
-/* windowsize is 2*max_r*step rounded up to a multiple of 16; centertap is
- * the index within the window that lines up with the src pointer.
- */
-.set windowsize, (((\max_r + \max_r) * \step + 15) & ~15)
-.set centertap, (windowsize - \max_r * \step)
-            /* r10 = max(centertap - r8, 0) */
-            mov         r10, #centertap
-            subs        r10, r10, r8
-            movlo       r10, #0
-
-            /* r11 = windowsize + min(r4 - (windowsize - centertap), 0) */
-            subs        r11, r4, #windowsize - centertap
-            movhs       r11, #0
-            add         r11, r11, #windowsize
-
-            /* r10 indicates where in the window legal image data begins.
-             * r11 indicates where in the window legal image data ends.
-             * When starting near the centre of a large image these would be
-             * zero and windowsize respectively, but when starting near the
-             * edges this can change.
-             * When starting on the leftmost pixel, r10 will be centertap.
-             * When starting on the rightmost pixel, r11 will be centertap+1.
-             */
-
-            /* r4 indicates how much data there is between the current pointers
-             * and the right edge of the image.  The pointers currently point
-             * to the data needed at centertap.  The subsequent code will
-             * consume (windowsize - r10) data, but only the data from
-             * centertap to windowsize comes out of r4's budget.
-             */
-1:          subs        r4, r4, #windowsize - centertap
-            movlo       r4, #0
-
-            /* And the pointers need to rewind to the start of the window.
-             */
-            sub         r1, r1, #centertap
-
-            /* Unless r8 indicated that there wasn't that much data available.
-             */
-            add         r1, r1, r10
-
-
-            /* Get the first chunk, and add padding to align it to the window
-             * if necessary.
-             */
-            bl          fetch_clampleft\step
-
-            /* Sometimes the start and the end of the window are in the same
-             * chunk.  In that case both ends need filler at the outset.
-             * (r10 and r11-1 are in the same 16-entry chunk exactly when
-             * their exclusive-or is below 16.)
-             */
-            sub         r12, r11, #1
-            eor         r12,  r10, r12
-            cmp         r12, #16
-            bllo        prefill_sweepright\step
-
-            /* Iterate through all the points in the window and fill them in
-             * with padding or image data as needed.
-             */
-            prefill_body \max_r, \step, \label
-.endm
-
-/* The main body of the convolve functions.  Having already pre-filled the
- * convolution window with 2*r input values, the logic settles into a regular
- * pattern of reading and writing at a 1:1 rate until either input or output
- * expires.  The input leads the output by r values, so when processing all the
- * way to the right-hand edge, or within r pixels of that edge, the input will
- * run out first.  In the case of very narrow images, or sub-windows starting
- * near the right edge, the input may already have run out while the
- * convolution window was being filled and this loop will start with a
- * zero-length input.
- *
- * Once the input runs out, the rest of the output must be processed by padding
- * the remainder of the window with pad value from the last valid pixel from
- * the source.
- *
- * Input:
- *      r0 = dst
- *      r1 = src
- *      r2 = pitch
- *      r3 = count
- *      r4 = inlen
- *      r5 = r
- *      r6 = rup
- *      r7 = rdn
- *      r9 = buffer
- * Modifies
- *      r8 = fetch code pointer
- */
-.macro conv_body core, step=1, max_r=25, labelc="", labelnc=""
-
-            /* If r4 >= r3 then there's no need for clipping.  The main loop
-             * needs to exit when either r3 or r4 runs out, so clamp r4 to be
-             * no greater than r3 and use r4 for the loop.
-             * However, if r4 comes out of the loop with less than 16 bytes
-             * left, a partial read would be necessary to avoid reading beyond
-             * the end of the image.  To avoid this, clamp r4 to the next
-             * multiple of 16, which is still sufficient to force it out of the
-             * loop but doesn't imply a rewind.
-             */
-            add         r12, r3, #15
-            bic         r12, r12, #15
-            cmp         r4, r12
-            movhi       r4, r12
-
-            /* First calculate the entry-point into the internal fetch logic.
-             * This is done so the same function can service several kernel
-             * sizes.
-             */
-            ldr         r8, 3f
-1:          add         r8, r8, pc
-            sub         r8, r5, LSL #5
-            sub         r8, r5, LSL #4
-            cmp         r5, r6
-            cmpeq       r5, r7
-            beq         5f
-
-            /* if (r != rup || r != rdn) then the address-clamping table should
-             * be used rather than the short-cut version.
-             */
-            ldr         r8, 3f+4
-2:          add         r8, r8, pc
-            sub         r8, r5, LSL #6
-            b           5f
-            .align 3
-3:          .word       \labelnc-1b-8
-            .word       \labelc-2b-8
-
-            /* Main loop: ... */
-            .align 4
-3:          /* first perform a vertical convolution from memory to get the next
-             * 16 taps of the horizontal window into the register file...
-             */
-            fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=r8
-
-            /* ...then perform a horizontal convolution on that window to
-             * produce eight output bytes, and slide the window along.
-             * This has to be done twice to match the 16-way vertical pass.
-             * It would be preferable to have twice the work done in \core, but
-             * that would demand yet another variant on those macros and would
-             * perturb the register allocation severely.
-             */
-            \core
-            vst1.u8     {d31}, [r0]!
-            \core
-            vst1.u8     {d31}, [r0]!
-
-            sub         r3, r3, #16
-5:          subs        r4, r4, #16
-            bhi         3b
-            /* Here there are 16 or fewer bytes available before the edge of the
-             * source image.  r4 holds that count minus 16 (because it was
-             * decremented before the first iteration ran).  The last read may
-             * not be a whole chunk, and beyond that a fill value must be used.
-             *
-             * Of course, none of that matters if there's no more output to
-             * produce...
-             */
-            cmp         r3, #0
-            beq         5f
-
-            /* Oh well. */
-            adds        r4, r4, #16
-            bne         1f
-  .if \step==1
-            vdup.u16    q10, d19[3]
-            vdup.u16    q11, d19[3]
-  .else
-            vmov.u64    d20, d19
-            vmov.u64    d21, d19
-            vmov.u64    d22, d19
-            vmov.u64    d23, d19
-  .endif
-            b           3f
-
-            /* To avoid reading past end of input, rewind pointers by (16-r4)
-             * to ensure that they're exactly 16 bytes from the edge.
-             */
-1:          mov         r11, r4
-            bl          fetch_clampright\step
-            /* Now to put this padding to use, perform any remaining
-             * iterations.  This is done at half the rate of the main loop,
-             * because there's no longer pressure from a 16-lane window filler.
-             */
-3:          \core
-  .if \step==1
-            vdup.u16    q11, d23[3]
-  .else
-            vmov.u64    d22, d23
-  .endif
-            subs        r3, r3, #8
-            blo         4f
-            vst1.u8     {d31}, [r0]!
-            bne         3b
-            b           5f
-
-            /* If the final iteration contained 0 < l < 8 values, then perform
-             * a piecewise store of the final vector.
-             */
-4:          tst         r3, #4
-            beq         1f
-            vst1.u32    {d31[0]}, [r0]!
-            vext.u8     d31, d31, d31, #4
-1:          tst         r3, #2
-            beq         1f
-            vst1.u16    {d31[0]}, [r0]!
-            vext.u8     d31, d31, d31, #2
-1:          tst         r3, #1
-            beq         5f
-            vst1.u8     {d31[0]}, [r0]!
-            vext.u8     d31, d31, d31, #1
-5:          mov         r0, #0
-.endm
-
-.irp r, TUNED_LIST1, 25
-PRIVATE(convolve1_\r)
-            push        {r12,lr}
-
-            prefill     step=1, max_r=\r, label=.Lcnv1_\r
-
-            conv_body   core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r
-
-            pop         {r12,pc}
-END(convolve1_\r)
-.endr
-
-.irp r, TUNED_LIST4, 25
-PRIVATE(convolve4_\r)
-            push        {r12,lr}
-            sub         r9, sp, #0x200
-            sub         sp, sp, #0x200 + 0x400
-            bic         r9, r9, #0x3fc
-
-            /* r9 now points to a 0x200 byte buffer on the stack whose address
-             * has the low 10 bits clear.  This allows easy address calculation
-             * in the wrap-around cases.
-             */
-
-            prefill     step=4, max_r=\r, label=.Lcnv4_\r
-
-            conv_body   core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r
-
-            add         sp, sp, #0x200 + 0x400
-            pop         {r12,pc}
-END(convolve4_\r)
-.endr
-
-/* void rsdIntrinsicBlurU1_K(
- *                  void *out,      // r0
- *                  void *in,       // r1
- *                  size_t w,       // r2
- *                  size_t h,       // r3
- *                  size_t p,       // [sp]
- *                  size_t x,       // [sp,#4]
- *                  size_t y,       // [sp,#8]
- *                  size_t count,   // [sp,#12]
- *                  size_t r,       // [sp,#16]
- *                  uint16_t *tab); // [sp,#20]
- */
-ENTRY(rsdIntrinsicBlurU1_K)
-            push        {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
-            vpush       {d8-d15}
-            ldr         r6, [sp,#112]   // y
-            ldr         r8, [sp,#108]   // x
-            ldr         r5, [sp,#120]   // r
-            sub         r4, r2, r8      // inlen = w - x
-            sub         r7, r3, r6      // h - y
-            ldr         r2, [sp,#104]   // pitch
-            ldr         r3, [sp,#116]   // count
-            sub         r7, r7, #1      // h - y - 1
-
-            ldr         r12, [sp,#124]
-
-            add         r1, r1, r8      // src += x
-
-            cmp         r6, r5
-            movhi       r6, r5          // rup = min(r, y)
-            cmp         r7, r5
-            movhi       r7, r5          // rdn = min(r, h - y - 1)
-
-            vld1.u16    {d0,d1,d2,d3}, [r12]!
-            vld1.u16    {d4,d5,d6}, [r12]!
-
-            adr         lr, 1f
-  .irp r, TUNED_LIST1
-            cmp         r5, #\r
-            bls         convolve1_\r
-  .endr
-            b           convolve1_25
-
-1:          vpop        {d8-d15}
-            pop         {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
-END(rsdIntrinsicBlurU1_K)
-
-/* void rsdIntrinsicBlurU4_K(
- *                  void *out,      // r0
- *                  void *in,       // r1
- *                  size_t w,       // r2
- *                  size_t h,       // r3
- *                  size_t p,       // [sp]
- *                  size_t x,       // [sp,#4]
- *                  size_t y,       // [sp,#8]
- *                  size_t count,   // [sp,#12]
- *                  size_t r,       // [sp,#16]
- *                  uint16_t *tab); // [sp,#20]
- */
-ENTRY(rsdIntrinsicBlurU4_K)
-            push        {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
-            vpush       {d8-d15}
-            ldr         r6, [sp,#112]   // y
-            ldr         r8, [sp,#108]   // x
-            ldr         r5, [sp,#120]   // r
-            lsl         r8, r8, #2
-            rsb         r4, r8, r2, LSL #2 // inlen = (w - x)
-            sub         r7, r3, r6      // h - y
-            ldr         r2, [sp,#104]   // pitch
-            ldr         r3, [sp,#116]   // count
-            sub         r7, r7, #1      // h - y - 1
-            lsl         r3, r3, #2      // count
-
-            ldr         r12, [sp,#124]
-
-            add         r1, r1, r8      // in += x
-
-            cmp         r6, r5
-            movhi       r6, r5          // rup = min(r, y)
-            cmp         r7, r5
-            movhi       r7, r5          // rdn = min(r, h - y - 1)
-
-            vld1.u16    {d0,d1,d2,d3}, [r12]!
-            vld1.u16    {d4,d5,d6}, [r12]!
-
-            adr         lr, 1f
-  .irp r, TUNED_LIST4
-            cmp         r5, #\r
-            bls         convolve4_\r
-  .endr
-            b           convolve4_25
-
-1:          vpop        {d8-d15}
-            pop         {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
-END(rsdIntrinsicBlurU4_K)

diff --git a/toolkit/ColorMatrix.cpp b/toolkit/ColorMatrix.cpp
deleted file mode 100644
index dd426cf..0000000
--- a/toolkit/ColorMatrix.cpp
+++ /dev/null

@@ -1,1066 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "RenderScriptToolkit.h"
-#include "TaskProcessor.h"
-#include "Utils.h"
-#include <assert.h>
-#include <cstdint>
-#include <sys/mman.h>
-
-namespace android {
-namespace renderscript {
-
-#define LOG_TAG "renderscript.toolkit.ColorMatrix"
-
-/*  uint kernel
- *  Q0  D0:  Load slot for R
- *      D1:  Load slot for G
- *  Q1  D2:  Load slot for B
- *      D3:  Load slot for A
- *  Q2  D4:  Matrix
- *      D5:  =
- *  Q3  D6:  =
- *      D7:  =
- *  Q4  D8:  Add R
- *      D9:
- *  Q5  D10: Add G
- *      D11:
- *  Q6  D12: Add B
- *      D13:
- *  Q7  D14: Add A
- *      D15:
- *  Q8  D16:  I32: R Sum
- *      D17:
- *  Q9  D18:  I32: G Sum
- *      D19:
- *  Q10 D20:  I32: B Sum
- *      D21:
- *  Q11 D22:  I32: A Sum
- *      D23:
- *  Q12 D24:  U16: expanded R
- *      D25:
- *  Q13 D26:  U16: expanded G
- *      D27:
- *  Q14 D28:  U16: expanded B
- *      D29:
- *  Q15 D30:  U16: expanded A
- *      D31:
- *
- */
-
-/*  float kernel
- *  Q0  D0:  Load slot for R
- *      D1:  =
- *  Q1  D2:  Load slot for G
- *      D3:  =
- *  Q2  D4:  Load slot for B
- *      D5:  =
- *  Q3  D6:  Load slot for A
- *      D7:  =
- *  Q4  D8:  Matrix
- *      D9:  =
- *  Q5  D10: =
- *      D11: =
- *  Q6  D12: =
- *      D13: =
- *  Q7  D14: =
- *      D15: =
- *  Q8  D16: Add R
- *      D17: =
- *  Q9  D18: Add G
- *      D19: =
- *  Q10 D20: Add B
- *      D21: =
- *  Q11 D22: Add A
- *      D23: =
- *  Q12 D24: Sum R
- *      D25: =
- *  Q13 D26: Sum G
- *      D27: =
- *  Q14 D28: Sum B
- *      D29: =
- *  Q15 D30: Sum A
- *      D31: =
- *
- */
-
-typedef union {
-    uint64_t key;
-    struct {
-        uint32_t inVecSize          :2;  // [0 - 1]
-        uint32_t outVecSize         :2;  // [2 - 3]
-        uint32_t inType             :4;  // [4 - 7]
-        uint32_t outType            :4;  // [8 - 11]
-        uint32_t dot                :1;  // [12]
-        uint32_t _unused1           :1;  // [13]
-        uint32_t copyAlpha          :1;  // [14]
-        uint32_t _unused2           :1;  // [15]
-        uint32_t coeffMask          :16; // [16-31]
-        uint32_t addMask            :4;  // [32-35]
-    } u;
-} Key_t;
-
-/* The two data types and their values, as specified in the RenderScript documentation.
- * Only RS_TYPE_UNSIGNED_8 is currently supported.
- *
- * TODO: The actual values of these constants are likely not important. We may be
- * able to simplify the key related code.
- */
-const int RS_TYPE_UNSIGNED_8 = 8;
-const int RS_TYPE_FLOAT_32 = 2;
-
-//Re-enable when intrinsic is fixed
-#if defined(ARCH_ARM64_USE_INTRINSICS)
-typedef struct {
-    void (*column[4])(void);
-    void (*store)(void);
-    void (*load)(void);
-    void (*store_end)(void);
-    void (*load_end)(void);
-} FunctionTab_t;
-
-extern "C" void rsdIntrinsicColorMatrix_int_K(
-             void *out, void const *in, size_t count,
-             FunctionTab_t const *fns,
-             int16_t const *mult, int32_t const *add);
-
-extern "C" void rsdIntrinsicColorMatrix_float_K(
-             void *out, void const *in, size_t count,
-             FunctionTab_t const *fns,
-             float const *mult, float const *add);
-
-/* The setup functions fill in function tables to be used by above functions;
- * this code also eliminates jump-to-another-jump cases by short-circuiting
- * empty functions.  While it's not performance critical, it works out easier
- * to write the set-up code in assembly than to try to expose the same symbols
- * and write the code in C.
- */
-extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
-             FunctionTab_t *fns,
-             uint32_t mask, int dt, int st);
-
-extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
-             FunctionTab_t *fns,
-             uint32_t mask, int dt, int st);
-#endif
-
-class ColorMatrixTask : public Task {
-    const void* mIn;
-    void* mOut;
-    size_t mInputVectorSize;
-    uint32_t mOutstep;
-    uint32_t mInstep;
-
-    float mFp[16];
-    float mFpa[4];
-
-    // The following four fields are read as constants
-    // by the SIMD assembly code.
-    int16_t mIp[16];
-    int mIpa[4];
-    float mTmpFp[16];
-    float mTmpFpa[4];
-#if defined(ARCH_ARM64_USE_INTRINSICS)
-    FunctionTab_t mFnTab;
-#endif
-
-    void kernel(uchar* out, uchar* in, uint32_t xstart, uint32_t xend);
-    void updateCoeffCache(float fpMul, float addMul);
-
-    Key_t mLastKey;
-    unsigned char* mBuf;
-    size_t mBufSize;
-
-    bool build(Key_t key);
-    void (*mOptKernel)(void* dst, const void* src, const int16_t* coef, uint32_t count);
-
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-    Key_t computeKey(size_t inVectorSize, int inType, size_t outVectorSize, int outType);
-    void preLaunch(size_t inVectorSize, int inType, size_t outVectorSize, int outType);
-#else
-    Key_t computeKey(size_t inVectorSize, size_t outVectorSize);
-    void preLaunch(size_t inVectorSize, size_t outVectorSize);
-#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-
-    // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
-    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
-                             size_t endY) override;
-
-   public:
-    ColorMatrixTask(const void* in, void* out, size_t inputVectorSize, size_t outputVectorSize,
-                    size_t sizeX, size_t sizeY, const float* matrix, const float* addVector,
-                    const Restriction* restriction)
-        : Task{sizeX, sizeY, outputVectorSize, true, restriction},
-          mIn{in},
-          mOut{out},
-          mInputVectorSize{inputVectorSize} {
-        mLastKey.key = 0;
-        mBuf = nullptr;
-        mBufSize = 0;
-        mOptKernel = nullptr;
-
-        mOutstep = paddedSize(outputVectorSize);
-        mInstep = paddedSize(inputVectorSize);
-
-        memcpy(mFp, matrix, sizeof(mFp));
-        memcpy(mFpa, addVector, sizeof(mFpa));
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-        // For float support, we'll have to pass the type in the constructor too.
-        preLaunch(inputVectorSize, RS_TYPE_UNSIGNED_8, outputVectorSize, RS_TYPE_UNSIGNED_8);
-#else
-        preLaunch(inputVectorSize, outputVectorSize);
-#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-    }
-    ~ColorMatrixTask() {
-        if (mBuf) munmap(mBuf, mBufSize);
-        mBuf = nullptr;
-        mOptKernel = nullptr;
-    }
-};
-
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-Key_t ColorMatrixTask::computeKey(size_t inVectorSize, int inType, size_t outVectorSize,
-                                  int outType) {
-    Key_t key;
-    key.key = 0;
-
-    // Compute a unique code key for this operation
-
-    // Add to the key the input and output types
-    bool hasFloat = false;
-    if (inType == RS_TYPE_FLOAT_32) {
-        hasFloat = true;
-        key.u.inType = RS_TYPE_FLOAT_32;
-    }
-    if (outType == RS_TYPE_FLOAT_32) {
-        hasFloat = true;
-        key.u.outType = RS_TYPE_FLOAT_32;
-    }
-
-    // Mask in the bits indicating which coefficients in the
-    // color matrix are needed.
-    if (hasFloat) {
-        for (uint32_t i=0; i < 16; i++) {
-            if (fabs(mFp[i]) != 0.f) {
-                key.u.coeffMask |= 1 << i;
-            }
-        }
-        if (fabs(mFpa[0]) != 0.f) key.u.addMask |= 0x1;
-        if (fabs(mFpa[1]) != 0.f) key.u.addMask |= 0x2;
-        if (fabs(mFpa[2]) != 0.f) key.u.addMask |= 0x4;
-        if (fabs(mFpa[3]) != 0.f) key.u.addMask |= 0x8;
-
-    } else {
-#else
-Key_t ColorMatrixTask::computeKey(size_t inVectorSize, size_t outVectorSize) {
-    Key_t key;
-    key.key = 0;
-
-    // Compute a unique code key for this operation
-    {
-#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-
-        for (uint32_t i=0; i < 16; i++) {
-            if (mIp[i] != 0) {
-                key.u.coeffMask |= 1 << i;
-            }
-        }
-        if (mIpa[0] != 0) key.u.addMask |= 0x1;
-        if (mIpa[1] != 0) key.u.addMask |= 0x2;
-        if (mIpa[2] != 0) key.u.addMask |= 0x4;
-        if (mIpa[3] != 0) key.u.addMask |= 0x8;
-    }
-
-    // Look for a dot product where the r,g,b columns are the same
-    if ((mIp[0] == mIp[1]) && (mIp[0] == mIp[2]) &&
-        (mIp[4] == mIp[5]) && (mIp[4] == mIp[6]) &&
-        (mIp[8] == mIp[9]) && (mIp[8] == mIp[10]) &&
-        (mIp[12] == mIp[13]) && (mIp[12] == mIp[14])) {
-
-        if (!key.u.addMask) key.u.dot = 1;
-    }
-
-    // Is alpha a simple copy
-    if (!(key.u.coeffMask & 0x0888) && (mIp[15] == 256) && !(key.u.addMask & 0x8)) {
-        key.u.copyAlpha = !(key.u.inType || key.u.outType);
-    }
-
-    //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
-
-    switch (inVectorSize) {
-    case 4:
-        key.u.inVecSize = 3;
-        break;
-    case 3:
-        key.u.inVecSize = 2;
-        key.u.coeffMask &= ~0xF000;
-        break;
-    case 2:
-        key.u.inVecSize = 1;
-        key.u.coeffMask &= ~0xFF00;
-        break;
-    default:
-        key.u.coeffMask &= ~0xFFF0;
-        break;
-    }
-
-    switch (outVectorSize) {
-    case 4:
-        key.u.outVecSize = 3;
-        break;
-    case 3:
-        key.u.outVecSize = 2;
-        key.u.coeffMask &= ~0x8888;
-        key.u.addMask &= 7;
-        break;
-    case 2:
-        key.u.outVecSize = 1;
-        key.u.coeffMask &= ~0xCCCC;
-        key.u.addMask &= 3;
-        break;
-    default:
-        key.u.coeffMask &= ~0xEEEE;
-        key.u.addMask &= 1;
-        break;
-    }
-
-    if (key.u.inType && !key.u.outType) {
-        key.u.addMask |= 1;
-        if (key.u.outVecSize > 0) key.u.addMask |= 2;
-        if (key.u.outVecSize > 1) key.u.addMask |= 4;
-        if (key.u.outVecSize > 2) key.u.addMask |= 8;
-    }
-
-    //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
-    return key;
-}
-
-#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
-
-#define DEF_SYM(x)                                  \
-    extern "C" uint32_t _N_ColorMatrix_##x;      \
-    extern "C" uint32_t _N_ColorMatrix_##x##_end;  \
-    extern "C" uint32_t _N_ColorMatrix_##x##_len;
-
-DEF_SYM(prefix_i)
-DEF_SYM(prefix_f)
-DEF_SYM(postfix1)
-DEF_SYM(postfix2)
-
-DEF_SYM(load_u8_4)
-DEF_SYM(load_u8_3)
-DEF_SYM(load_u8_2)
-DEF_SYM(load_u8_1)
-DEF_SYM(load_u8f_4)
-DEF_SYM(load_u8f_3)
-DEF_SYM(load_u8f_2)
-DEF_SYM(load_u8f_1)
-DEF_SYM(load_f32_4)
-DEF_SYM(load_f32_3)
-DEF_SYM(load_f32_2)
-DEF_SYM(load_f32_1)
-
-DEF_SYM(store_u8_4)
-DEF_SYM(store_u8_2)
-DEF_SYM(store_u8_1)
-DEF_SYM(store_f32_4)
-DEF_SYM(store_f32_3)
-DEF_SYM(store_f32_2)
-DEF_SYM(store_f32_1)
-DEF_SYM(store_f32u_4)
-DEF_SYM(store_f32u_2)
-DEF_SYM(store_f32u_1)
-
-DEF_SYM(unpack_u8_4)
-DEF_SYM(unpack_u8_3)
-DEF_SYM(unpack_u8_2)
-DEF_SYM(unpack_u8_1)
-DEF_SYM(pack_u8_4)
-DEF_SYM(pack_u8_3)
-DEF_SYM(pack_u8_2)
-DEF_SYM(pack_u8_1)
-DEF_SYM(dot)
-DEF_SYM(add_0_u8)
-DEF_SYM(add_1_u8)
-DEF_SYM(add_2_u8)
-DEF_SYM(add_3_u8)
-
-#define ADD_CHUNK(x) \
-    memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
-    buf += _N_ColorMatrix_##x##_len
-
-
-static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
-    size_t off = (target - buf - 8) >> 2;
-    assert(((off & 0xff000000) == 0) ||
-           ((off & 0xff000000) == 0xff000000));
-
-    uint32_t op = (condition << 28);
-    op |= 0xa << 24;  // branch
-    op |= 0xffffff & off;
-    ((uint32_t *)buf)[0] = op;
-    return buf + 4;
-}
-
-static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
-    assert(vd < 32);
-    assert(vm < 32);
-    assert(vn < 32);
-
-    uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
-    op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
-    op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
-    return op;
-}
-
-static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
-                              uint32_t src_d2_s) {
-    //vmlal.s16 Q#1, D#1, D#2[#]
-    uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
-    ((uint32_t *)buf)[0] = op;
-    return buf + 4;
-}
-
-static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
-                              uint32_t src_d2_s) {
-    //vmull.s16 Q#1, D#1, D#2[#]
-    uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
-    ((uint32_t *)buf)[0] = op;
-    return buf + 4;
-}
-
-static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
-    //vqadd.s32 Q#1, Q#1, Q#2
-    uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
-    ((uint32_t *)buf)[0] = op;
-    return buf + 4;
-}
-
-static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
-                              uint32_t src_d2_s) {
-    //vmlal.f32 Q#1, D#1, D#2[#]
-    uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
-    ((uint32_t *)buf)[0] = op;
-    return buf + 4;
-}
-
-static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
-                              uint32_t src_d2_s) {
-    //vmull.f32 Q#1, D#1, D#2[#]
-    uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
-    ((uint32_t *)buf)[0] = op;
-    return buf + 4;
-}
-
-static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
-    //vorr Q#1, Q#2, Q#3
-    uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
-    ((uint32_t *)buf)[0] = op;
-    return buf + 4;
-}
-
-static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
-    //vmov.32 Q#1, #imm
-    assert(imm == 0);
-    (void) imm; // Avoid unused parameter warnings for non-debug builds
-    uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
-    ((uint32_t *)buf)[0] = op;
-    return buf + 4;
-}
-
-static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
-    //vadd.f32 Q#1, Q#2, Q#3
-    uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
-    ((uint32_t *)buf)[0] = op;
-    return buf + 4;
-}
-#endif
-
-#if defined(ARCH_X86_HAVE_SSSE3)
-extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
-                                  const int16_t *coef, uint32_t count);
-extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
-                                  const int16_t *coef, uint32_t count);
-extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
-                                  const int16_t *coef, uint32_t count);
-
-using android::renderscript::Key_t;
-
-void * selectKernel(Key_t key)
-{
-    void * kernel = nullptr;
-
-    // inType, outType float if nonzero
-    if (!(key.u.inType || key.u.outType)) {
-        if (key.u.dot)
-            kernel = (void *)rsdIntrinsicColorMatrixDot_K;
-        else if (key.u.copyAlpha)
-            kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
-        else
-            kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
-    }
-
-    return kernel;
-}
-#endif
-
-bool ColorMatrixTask::build(Key_t key) {
-#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
-    mBufSize = 4096;
-    //StopWatch build_time("rs cm: build time");
-    mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
-                                  MAP_PRIVATE | MAP_ANON, -1, 0);
-    if (mBuf == MAP_FAILED) {
-        mBuf = NULL;
-        return false;
-    }
-
-    uint8_t *buf = mBuf;
-    uint8_t *buf2 = nullptr;
-
-    int ops[5][4];  // 0=unused, 1 = set, 2 = accumulate, 3 = final
-    int opInit[4] = {0, 0, 0, 0};
-
-    memset(ops, 0, sizeof(ops));
-    for (int i=0; i < 4; i++) {
-        if (key.u.coeffMask & (1 << (i*4))) {
-            ops[i][0] = 0x2 | opInit[0];
-            opInit[0] = 1;
-        }
-        if (!key.u.dot) {
-            if (key.u.coeffMask & (1 << (1 + i*4))) {
-                ops[i][1] = 0x2 | opInit[1];
-                opInit[1] = 1;
-            }
-            if (key.u.coeffMask & (1 << (2 + i*4))) {
-                ops[i][2] = 0x2 | opInit[2];
-                opInit[2] = 1;
-            }
-        }
-        if (!key.u.copyAlpha) {
-            if (key.u.coeffMask & (1 << (3 + i*4))) {
-                ops[i][3] = 0x2 | opInit[3];
-                opInit[3] = 1;
-            }
-        }
-    }
-
-    if (key.u.inType || key.u.outType) {
-        key.u.copyAlpha = 0;
-        ADD_CHUNK(prefix_f);
-        buf2 = buf;
-
-        // Load the incoming r,g,b,a as needed
-        if (key.u.inType) {
-            switch(key.u.inVecSize) {
-            case 3:
-                ADD_CHUNK(load_f32_4);
-                break;
-            case 2:
-                ADD_CHUNK(load_f32_3);
-                break;
-            case 1:
-                ADD_CHUNK(load_f32_2);
-                break;
-            case 0:
-                ADD_CHUNK(load_f32_1);
-                break;
-            }
-        } else {
-            switch(key.u.inVecSize) {
-            case 3:
-                ADD_CHUNK(load_u8f_4);
-                break;
-            case 2:
-                ADD_CHUNK(load_u8f_3);
-                break;
-            case 1:
-                ADD_CHUNK(load_u8f_2);
-                break;
-            case 0:
-                ADD_CHUNK(load_u8f_1);
-                break;
-            }
-        }
-
-        for (int i=0; i < 4; i++) {
-            for (int j=0; j < 4; j++) {
-                switch(ops[i][j]) {
-                case 0:
-                    break;
-                case 2:
-                    buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
-                    break;
-                case 3:
-                    buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
-                    break;
-                }
-            }
-        }
-        for (int j=0; j < 4; j++) {
-            if (opInit[j]) {
-                if (key.u.addMask & (1 << j)) {
-                    buf = addVADD_F32(buf, j, 12+j, 8+j);
-                } else {
-                    buf = addVORR_32(buf, j, 12+j, 12+j);
-                }
-            } else {
-                if (key.u.addMask & (1 << j)) {
-                    buf = addVORR_32(buf, j, 8+j, 8+j);
-                } else {
-                    buf = addVMOV_32(buf, j, 0);
-                }
-            }
-        }
-
-        if (key.u.outType) {
-            switch(key.u.outVecSize) {
-            case 3:
-                ADD_CHUNK(store_f32_4);
-                break;
-            case 2:
-                ADD_CHUNK(store_f32_3);
-                break;
-            case 1:
-                ADD_CHUNK(store_f32_2);
-                break;
-            case 0:
-                ADD_CHUNK(store_f32_1);
-                break;
-            }
-        } else {
-            switch(key.u.outVecSize) {
-            case 3:
-            case 2:
-                ADD_CHUNK(store_f32u_4);
-                break;
-            case 1:
-                ADD_CHUNK(store_f32u_2);
-                break;
-            case 0:
-                ADD_CHUNK(store_f32u_1);
-                break;
-            }
-        }
-
-
-    } else {
-        // Add the function prefix
-        // Store the address for the loop return
-        ADD_CHUNK(prefix_i);
-        buf2 = buf;
-
-        // Load the incoming r,g,b,a as needed
-        switch(key.u.inVecSize) {
-        case 3:
-            ADD_CHUNK(load_u8_4);
-            if (key.u.copyAlpha) {
-                ADD_CHUNK(unpack_u8_3);
-            } else {
-                ADD_CHUNK(unpack_u8_4);
-            }
-            break;
-        case 2:
-            ADD_CHUNK(load_u8_3);
-            ADD_CHUNK(unpack_u8_3);
-            break;
-        case 1:
-            ADD_CHUNK(load_u8_2);
-            ADD_CHUNK(unpack_u8_2);
-            break;
-        case 0:
-            ADD_CHUNK(load_u8_1);
-            ADD_CHUNK(unpack_u8_1);
-            break;
-        }
-
-        // Add multiply and accumulate
-        // use MULL to init the output register,
-        // use MLAL from there
-        for (int i=0; i < 4; i++) {
-            for (int j=0; j < 4; j++) {
-                switch(ops[i][j]) {
-                case 0:
-                    break;
-                case 2:
-                    buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
-                    break;
-                case 3:
-                    buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
-                    break;
-                }
-            }
-        }
-        for (int j=0; j < 4; j++) {
-            if (opInit[j]) {
-                if (key.u.addMask & (1 << j)) {
-                    buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
-                }
-            } else {
-                if (key.u.addMask & (1 << j)) {
-                    buf = addVORR_32(buf, 8+j, 4+j, 4+j);
-                }
-            }
-        }
-
-        // If we have a dot product, perform the special pack.
-        if (key.u.dot) {
-            ADD_CHUNK(pack_u8_1);
-            ADD_CHUNK(dot);
-        } else {
-            switch(key.u.outVecSize) {
-            case 3:
-                if (key.u.copyAlpha) {
-                    ADD_CHUNK(pack_u8_3);
-                } else {
-                    ADD_CHUNK(pack_u8_4);
-                }
-                break;
-            case 2:
-                ADD_CHUNK(pack_u8_3);
-                break;
-            case 1:
-                ADD_CHUNK(pack_u8_2);
-                break;
-            case 0:
-                ADD_CHUNK(pack_u8_1);
-                break;
-            }
-        }
-
-        // Write out result
-        switch(key.u.outVecSize) {
-        case 3:
-        case 2:
-            ADD_CHUNK(store_u8_4);
-            break;
-        case 1:
-            ADD_CHUNK(store_u8_2);
-            break;
-        case 0:
-            ADD_CHUNK(store_u8_1);
-            break;
-        }
-    }
-
-    if (key.u.inType != key.u.outType) {
-        key.u.copyAlpha = 0;
-        key.u.dot = 0;
-    }
-
-    // Loop, branch, and cleanup
-    ADD_CHUNK(postfix1);
-    buf = addBranch(buf, buf2, 0x01);
-    ADD_CHUNK(postfix2);
-
-    int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
-    if (ret == -1) {
-        ALOGE("mprotect error %i", ret);
-        return false;
-    }
-
-    __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize);
-    return true;
-#else
-    (void) key; // Avoid unused parameter warning.
-    return false;
-#endif
-}
-
-void ColorMatrixTask::updateCoeffCache(float fpMul, float addMul) {
-    for(int ct=0; ct < 16; ct++) {
-        mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
-        mTmpFp[ct] = mFp[ct] * fpMul;
-        //ALOGE("mat %i %f  %f", ct, mFp[ct], tmpFp[ct]);
-    }
-
-    float add = 0.f;
-    if (fpMul > 254.f) add = 0.5f;
-    for(int ct=0; ct < 4; ct++) {
-        mTmpFpa[ct] = mFpa[ct] * addMul + add;
-        //ALOGE("mFpa %i %f  %f", ct, mFpa[ct], tmpFpa[ct * 4 + 0]);
-    }
-
-    for(int ct=0; ct < 4; ct++) {
-        mIpa[ct] = (int)(mFpa[ct] * 65536.f + 0.5f);
-    }
-}
-
-
-
-static void One(void *out,
-                const void *py, const float* coeff, const float *add,
-                uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
-
-    float4 f = 0.f;
-    if (fin) {
-        switch(vsin) {
-        case 3:
-            f = ((const float4 *)py)[0];
-            break;
-        case 2:
-            f = ((const float4 *)py)[0];
-            f.w = 0.f;
-            break;
-        case 1:
-            f.xy = ((const float2 *)py)[0];
-            break;
-        case 0:
-            f.x = ((const float *)py)[0];
-            break;
-        }
-    } else {
-        switch(vsin) {
-        case 3:
-            f = convert<float4>(((const uchar4 *)py)[0]);
-            break;
-        case 2:
-            f = convert<float4>(((const uchar4 *)py)[0]);
-            f.w = 0.f;
-            break;
-        case 1:
-            f.xy = convert<float2>(((const uchar2 *)py)[0]);
-            break;
-        case 0:
-            f.x = (float)(((const uchar *)py)[0]);
-            break;
-        }
-    }
-    //ALOGE("f1  %f %f %f %f", f.x, f.y, f.z, f.w);
-
-    float4 sum;
-    sum.x = f.x * coeff[0] +
-            f.y * coeff[4] +
-            f.z * coeff[8] +
-            f.w * coeff[12];
-    sum.y = f.x * coeff[1] +
-            f.y * coeff[5] +
-            f.z * coeff[9] +
-            f.w * coeff[13];
-    sum.z = f.x * coeff[2] +
-            f.y * coeff[6] +
-            f.z * coeff[10] +
-            f.w * coeff[14];
-    sum.w = f.x * coeff[3] +
-            f.y * coeff[7] +
-            f.z * coeff[11] +
-            f.w * coeff[15];
-    //ALOGE("f2  %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
-
-    sum.x += add[0];
-    sum.y += add[1];
-    sum.z += add[2];
-    sum.w += add[3];
-
-
-    //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
-    if (fout) {
-        switch(vsout) {
-        case 3:
-        case 2:
-            ((float4 *)out)[0] = sum;
-            break;
-        case 1:
-            ((float2 *)out)[0] = sum.xy;
-            break;
-        case 0:
-            ((float *)out)[0] = sum.x;
-            break;
-        }
-    } else {
-        sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
-        sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
-        sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
-        sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
-
-        switch(vsout) {
-        case 3:
-        case 2:
-            ((uchar4 *)out)[0] = convert<uchar4>(sum);
-            break;
-        case 1:
-            ((uchar2 *)out)[0] = convert<uchar2>(sum.xy);
-            break;
-        case 0:
-            ((uchar *)out)[0] = sum.x;
-            break;
-        }
-    }
-    //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2],
-    //      ((float *)out)[3]);
-}
-
-void ColorMatrixTask::kernel(uchar *out, uchar *in, uint32_t xstart, uint32_t xend) {
-    uint32_t x1 = xstart;
-    uint32_t x2 = xend;
-
-    uint32_t vsin = mLastKey.u.inVecSize;
-    uint32_t vsout = mLastKey.u.outVecSize;
-    bool floatIn = !!mLastKey.u.inType;
-    bool floatOut = !!mLastKey.u.outType;
-
-    //if (!info->current.y) ALOGE("steps %i %i   %i %i", instep, outstep, vsin, vsout);
-
-    if(x2 > x1) {
-        int32_t len = x2 - x1;
-        if (mUsesSimd) {
-            if((mOptKernel != nullptr) && (len >= 4)) {
-                // The optimized kernel processes 4 pixels at once
-                // and requires a minimum of 1 chunk of 4
-                mOptKernel(out, in, mIp, len >> 2);
-                // Update the len and pointers so the generic code can
-                // finish any leftover pixels
-                len &= ~3;
-                x1 += len;
-                out += mOutstep * len;
-                in += mInstep * len;
-            }
-#if defined(ARCH_ARM64_USE_INTRINSICS)
-            else {
-                if (mLastKey.u.inType == RS_TYPE_FLOAT_32 ||
-                    mLastKey.u.outType == RS_TYPE_FLOAT_32) {
-                    // Currently this generates off by one errors.
-                    // rsdIntrinsicColorMatrix_float_K(out, in, len, &mFnTab, tmpFp, tmpFpa);
-                    // x1 += len;
-                    // out += outstep * len;
-                    // in += instep * len;
-                } else {
-                    rsdIntrinsicColorMatrix_int_K(out, in, len, &mFnTab, mIp, mIpa);
-                    x1 += len;
-                    out += mOutstep * len;
-                    in += mInstep * len;
-                }
-            }
-#endif
-        }
-
-        while(x1 != x2) {
-            One(out, in, mTmpFp, mTmpFpa, vsin, vsout, floatIn, floatOut);
-            out += mOutstep;
-            in += mInstep;
-            x1++;
-        }
-    }
-}
-
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-void ColorMatrixTask::preLaunch(size_t inVectorSize, int inType, size_t outVectorSize,
-                                int outType) {
-    if (inType == outType) {
-        if (outType == RS_TYPE_UNSIGNED_8) {
-            updateCoeffCache(1.f, 255.f);
-        } else {
-            updateCoeffCache(1.f, 1.f);
-        }
-    } else {
-        if (outType == RS_TYPE_UNSIGNED_8) {
-            updateCoeffCache(255.f, 255.f);
-        } else {
-            updateCoeffCache(1.f / 255.f, 1.f);
-        }
-    }
-
-    Key_t key = computeKey(inVectorSize, inType, outVectorSize, outType);
-#else
-void ColorMatrixTask::preLaunch(size_t inVectorSize, size_t outVectorSize) {
-    updateCoeffCache(1.f, 255.f);
-
-    Key_t key = computeKey(inVectorSize, outVectorSize);
-#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-
-#if defined(ARCH_X86_HAVE_SSSE3)
-    if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
-        // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
-        // mOptKernel =
-        //     (void (*)(void *, const void *, const int16_t *, uint32_t)) selectKernel(key);
-        mLastKey = key;
-    }
-
-#else //if !defined(ARCH_X86_HAVE_SSSE3)
-    if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
-        if (mBuf) munmap(mBuf, mBufSize);
-        mBuf = nullptr;
-        mOptKernel = nullptr;
-        if (build(key)) {
-            mOptKernel = (void (*)(void *, const void *, const int16_t *, uint32_t)) mBuf;
-        }
-#if defined(ARCH_ARM64_USE_INTRINSICS)
-        else {
-            int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
-            int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
-            uint32_t mm = 0;
-            int i;
-            for (i = 0; i < 4; i++)
-            {
-                uint32_t m = (key.u.coeffMask >> i) & 0x1111;
-                m = ((m * 0x249) >> 9) & 15;
-                m |= ((key.u.addMask >> i) & 1) << 4;
-                mm |= m << (i * 5);
-            }
-
-            if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
-                rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
-            } else {
-                rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
-            }
-        }
-#endif
-        mLastKey = key;
-    }
-#endif //if !defined(ARCH_X86_HAVE_SSSE3)
-}
-
-void ColorMatrixTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
-                                  size_t endY) {
-    for (size_t y = startY; y < endY; y++) {
-        size_t offset = mSizeX * y + startX;
-        uchar* in = ((uchar*)mIn) + offset * paddedSize(mInputVectorSize);
-        uchar* out = ((uchar*)mOut) + offset * paddedSize(mVectorSize);
-        kernel(out, in, startX, endX);
-    }
-}
-
-static const float fourZeroes[]{0.0f, 0.0f, 0.0f, 0.0f};
-
-void RenderScriptToolkit::colorMatrix(const void* in, void* out, size_t inputVectorSize,
-                                      size_t outputVectorSize, size_t sizeX, size_t sizeY,
-                                      const float* matrix, const float* addVector,
-                                      const Restriction* restriction) {
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
-    if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
-        return;
-    }
-    if (inputVectorSize < 1 || inputVectorSize > 4) {
-        ALOGE("The inputVectorSize should be between 1 and 4. %zu provided.", inputVectorSize);
-        return;
-    }
-    if (outputVectorSize < 1 || outputVectorSize > 4) {
-        ALOGE("The outputVectorSize should be between 1 and 4. %zu provided.", outputVectorSize);
-        return;
-    }
-#endif
-
-    if (addVector == nullptr) {
-        addVector = fourZeroes;
-    }
-    ColorMatrixTask task(in, out, inputVectorSize, outputVectorSize, sizeX, sizeY, matrix,
-                         addVector, restriction);
-    processor->doTask(&task);
-}
-
-}  // namespace renderscript
-}  // namespace android

diff --git a/toolkit/ColorMatrix_advsimd.S b/toolkit/ColorMatrix_advsimd.S
deleted file mode 100644
index 55b0029..0000000
--- a/toolkit/ColorMatrix_advsimd.S
+++ /dev/null

@@ -1,1277 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
-#define END(f) .size f, .-f;
-
-
-.macro vmxx_f32 i, mask, opd, opa, opb
-  .if (\i) & \mask
-    .if (\i) & (\mask - 1)
-        fmla            \opd, \opa, \opb
-    .else
-        fmul            \opd, \opa, \opb
-    .endif
-  .endif
-.endm
-
-.macro vadd_f32 i, mask, opd, opa, opb, stupidsyntax1, stupidsyntax2
-  .if (\i) & \mask
-    .if (\i) & (\mask - 1)
-        fadd            \opd, \opa, \opb
-    .else
-        mov             \stupidsyntax1, \stupidsyntax2
-    .endif
-  .endif
-.endm
-
-.macro vmxx_s16 i, mask, opd, opa, opb
-  .if (\i) & \mask
-    .if (\i) & (\mask - 1 + 16)
-        smlal           \opd, \opa, \opb
-    .else
-        smull           \opd, \opa, \opb
-    .endif
-  .endif
-.endm
-
-.macro vmxx2_s16 i, mask, opd, opa, opb
-  .if (\i) & \mask
-    .if (\i) & (\mask - 1 + 16)
-        smlal2          \opd, \opa, \opb
-    .else
-        smull2          \opd, \opa, \opb
-    .endif
-  .endif
-.endm
-
-/* x0 = dst
- * x1 = src
- * x2 = count
- * x3 = params
- * x4 = column0_fn
- * x5 = column1_fn
- * x6 = column2_fn
- * x7 = column3_fn
- * x8 = store_fn
- * x9 = load_fn
- */
-.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-
-.align 6
-colormatrix_int_col0_\i:
-      .if \i & 16
-            dup         v6.4s, v4.s[0]
-            dup         v7.4s, v4.s[0]
-      .endif
-            vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[0]
-            vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[4]
-            vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[0]
-            vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[4]
-            vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[0]
-            vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[4]
-            vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[0]
-            vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[4]
-            sqshrun     v8.4h, v6.4s, #8
-            sqshrun2    v8.8h, v7.4s, #8
-            br          x5
-
-colormatrix_int_col0_n\i:
-      .if (\i^31) & 16
-            dup         v6.4s, v4.s[0]
-            dup         v7.4s, v4.s[0]
-      .endif
-            vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[0]
-            vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[4]
-            vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[0]
-            vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[4]
-            vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[0]
-            vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[4]
-            vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[0]
-            vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[4]
-            sqshrun     v8.4h, v6.4s, #8
-            sqshrun2    v8.8h, v7.4s, #8
-            br          x5
-
-.align 6
-colormatrix_int_col1_\i:
-      .if \i & 16
-            dup         v6.4s, v4.s[1]
-            dup         v7.4s, v4.s[1]
-      .endif
-            vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[1]
-            vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[5]
-            vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[1]
-            vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[5]
-            vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[1]
-            vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[5]
-            vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[1]
-            vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[5]
-            sqshrun     v9.4h, v6.4s, #8
-            sqshrun2    v9.8h, v7.4s, #8
-            br          x6
-
-colormatrix_int_col1_n\i:
-      .if (\i^31) & 16
-            dup         v6.4s, v4.s[1]
-            dup         v7.4s, v4.s[1]
-      .endif
-            vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[1]
-            vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[5]
-            vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[1]
-            vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[5]
-            vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[1]
-            vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[5]
-            vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[1]
-            vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[5]
-            sqshrun     v9.4h, v6.4s, #8
-            sqshrun2    v9.8h, v7.4s, #8
-            br          x6
-
-.align 6
-colormatrix_int_col2_\i:
-      .if \i & 16
-            dup         v6.4s, v4.s[2]
-            dup         v7.4s, v4.s[2]
-      .endif
-            vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[2]
-            vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[6]
-            vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[2]
-            vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[6]
-            vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[2]
-            vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[6]
-            vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[2]
-            vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[6]
-            sqshrun     v10.4h, v6.4s, #8
-            sqshrun2    v10.8h, v7.4s, #8
-            br          x7
-
-colormatrix_int_col2_n\i:
-      .if (\i^31) & 16
-            dup         v6.4s, v4.s[2]
-            dup         v7.4s, v4.s[2]
-      .endif
-            vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[2]
-            vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[6]
-            vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[2]
-            vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[6]
-            vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[2]
-            vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[6]
-            vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[2]
-            vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[6]
-            sqshrun     v10.4h, v6.4s, #8
-            sqshrun2    v10.8h, v7.4s, #8
-            br          x7
-
-.align 6
-colormatrix_int_col3_\i:
-      .if \i & 16
-            dup         v6.4s, v4.s[3]
-            dup         v7.4s, v4.s[3]
-      .endif
-            vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[3]
-            vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[7]
-            vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[3]
-            vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[7]
-            vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[3]
-            vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[7]
-            vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[3]
-            vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[7]
-            sqshrun     v11.4h, v6.4s, #8
-            sqshrun2    v11.8h, v7.4s, #8
-            br          x8
-
-colormatrix_int_col3_n\i:
-      .if (\i^31) & 16
-            dup         v6.4s, v4.s[3]
-            dup         v7.4s, v4.s[3]
-      .endif
-            vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[3]
-            vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[7]
-            vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[3]
-            vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[7]
-            vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[3]
-            vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[7]
-            vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[3]
-            vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[7]
-            sqshrun     v11.4h, v6.4s, #8
-            sqshrun2    v11.8h, v7.4s, #8
-            br          x8
-
-.align 5
-colormatrix_float_col0_\i:
-            vmxx_f32    \i, 1,  v8.4s, v12.4s, v0.s[0]
-            vmxx_f32    \i, 2,  v8.4s, v13.4s, v1.s[0]
-            vmxx_f32    \i, 4,  v8.4s, v14.4s, v2.s[0]
-            vmxx_f32    \i, 8,  v8.4s, v15.4s, v3.s[0]
-            vadd_f32    \i, 16, v8.4s, v8.4s, v4.4s,        v8.16b, v4.16b
-            vmxx_f32    \i, 1,  v16.4s, v20.4s, v0.s[0]
-            vmxx_f32    \i, 2,  v16.4s, v21.4s, v1.s[0]
-            vmxx_f32    \i, 4,  v16.4s, v22.4s, v2.s[0]
-            vmxx_f32    \i, 8,  v16.4s, v23.4s, v3.s[0]
-            vadd_f32    \i, 16, v16.4s, v16.4s, v4.4s,      v16.16b, v4.16b
-            br          x5
-
-.align 4
-colormatrix_float_col0_n\i:
-            vmxx_f32    \i^31, 1,  v8.4s, v12.4s, v0.s[0]
-            vmxx_f32    \i^31, 2,  v8.4s, v13.4s, v1.s[0]
-            vmxx_f32    \i^31, 4,  v8.4s, v14.4s, v2.s[0]
-            vmxx_f32    \i^31, 8,  v8.4s, v15.4s, v3.s[0]
-            vadd_f32    \i^31, 16, v8.4s, v8.4s, v4.4s,     v8.16b, v4.16b
-            vmxx_f32    \i^31, 1,  v16.4s, v20.4s, v0.s[0]
-            vmxx_f32    \i^31, 2,  v16.4s, v21.4s, v1.s[0]
-            vmxx_f32    \i^31, 4,  v16.4s, v22.4s, v2.s[0]
-            vmxx_f32    \i^31, 8,  v16.4s, v23.4s, v3.s[0]
-            vadd_f32    \i^31, 16, v16.4s, v16.4s, v4.4s,   v16.16b, v4.16b
-            br          x5
-
-.align 5
-colormatrix_float_col1_\i:
-            vmxx_f32    \i, 1,  v9.4s, v12.4s, v0.s[1]
-            vmxx_f32    \i, 2,  v9.4s, v13.4s, v1.s[1]
-            vmxx_f32    \i, 4,  v9.4s, v14.4s, v2.s[1]
-            vmxx_f32    \i, 8,  v9.4s, v15.4s, v3.s[1]
-            vadd_f32    \i, 16, v9.4s, v9.4s, v5.4s,        v9.16b, v5.16b
-            vmxx_f32    \i, 1,  v17.4s, v20.4s, v0.s[1]
-            vmxx_f32    \i, 2,  v17.4s, v21.4s, v1.s[1]
-            vmxx_f32    \i, 4,  v17.4s, v22.4s, v2.s[1]
-            vmxx_f32    \i, 8,  v17.4s, v23.4s, v3.s[1]
-            vadd_f32    \i, 16, v17.4s, v17.4s, v5.4s,      v17.16b, v5.16b
-            br          x6
-
-.align 4
-colormatrix_float_col1_n\i:
-            vmxx_f32    \i^31, 1,  v9.4s, v12.4s, v0.s[1]
-            vmxx_f32    \i^31, 2,  v9.4s, v13.4s, v1.s[1]
-            vmxx_f32    \i^31, 4,  v9.4s, v14.4s, v2.s[1]
-            vmxx_f32    \i^31, 8,  v9.4s, v15.4s, v3.s[1]
-            vadd_f32    \i^31, 16, v9.4s, v9.4s, v5.4s,     v9.16b, v5.16b
-            vmxx_f32    \i^31, 1,  v17.4s, v20.4s, v0.s[1]
-            vmxx_f32    \i^31, 2,  v17.4s, v21.4s, v1.s[1]
-            vmxx_f32    \i^31, 4,  v17.4s, v22.4s, v2.s[1]
-            vmxx_f32    \i^31, 8,  v17.4s, v23.4s, v3.s[1]
-            vadd_f32    \i^31, 16, v17.4s, v17.4s, v5.4s,   v17.16b, v5.16b
-            br          x6
-
-.align 5
-colormatrix_float_col2_\i:
-            vmxx_f32    \i, 1,  v10.4s, v12.4s, v0.s[2]
-            vmxx_f32    \i, 2,  v10.4s, v13.4s, v1.s[2]
-            vmxx_f32    \i, 4,  v10.4s, v14.4s, v2.s[2]
-            vmxx_f32    \i, 8,  v10.4s, v15.4s, v3.s[2]
-            vadd_f32    \i, 16, v10.4s, v10.4s, v6.4s,      v10.16b, v6.16b
-            vmxx_f32    \i, 1,  v18.4s, v20.4s, v0.s[2]
-            vmxx_f32    \i, 2,  v18.4s, v21.4s, v1.s[2]
-            vmxx_f32    \i, 4,  v18.4s, v22.4s, v2.s[2]
-            vmxx_f32    \i, 8,  v18.4s, v23.4s, v3.s[2]
-            vadd_f32    \i, 16, v18.4s, v18.4s, v6.4s,      v18.16b, v6.16b
-            br          x7
-
-.align 4
-colormatrix_float_col2_n\i:
-            vmxx_f32    \i^31, 1,  v10.4s, v12.4s, v0.s[2]
-            vmxx_f32    \i^31, 2,  v10.4s, v13.4s, v1.s[2]
-            vmxx_f32    \i^31, 4,  v10.4s, v14.4s, v2.s[2]
-            vmxx_f32    \i^31, 8,  v10.4s, v15.4s, v3.s[2]
-            vadd_f32    \i^31, 16, v10.4s, v10.4s, v6.4s,   v10.16b, v6.16b
-            vmxx_f32    \i^31, 1,  v18.4s, v20.4s, v0.s[2]
-            vmxx_f32    \i^31, 2,  v18.4s, v21.4s, v1.s[2]
-            vmxx_f32    \i^31, 4,  v18.4s, v22.4s, v2.s[2]
-            vmxx_f32    \i^31, 8,  v18.4s, v23.4s, v3.s[2]
-            vadd_f32    \i^31, 16, v18.4s, v18.4s, v6.4s,   v18.16b, v6.16b
-            br          x7
-
-.align 5
-colormatrix_float_col3_\i:
-            vmxx_f32    \i, 1,  v11.4s, v12.4s, v0.s[3]
-            vmxx_f32    \i, 2,  v11.4s, v13.4s, v1.s[3]
-            vmxx_f32    \i, 4,  v11.4s, v14.4s, v2.s[3]
-            vmxx_f32    \i, 8,  v11.4s, v15.4s, v3.s[3]
-            vadd_f32    \i, 16, v11.4s, v11.4s, v7.4s,      v11.16b, v7.16b
-            vmxx_f32    \i, 1,  v19.4s, v20.4s, v0.s[3]
-            vmxx_f32    \i, 2,  v19.4s, v21.4s, v1.s[3]
-            vmxx_f32    \i, 4,  v19.4s, v22.4s, v2.s[3]
-            vmxx_f32    \i, 8,  v19.4s, v23.4s, v3.s[3]
-            vadd_f32    \i, 16, v19.4s, v19.4s, v7.4s,      v19.16b, v7.16b
-            br          x8
-
-.align 4
-colormatrix_float_col3_n\i:
-            vmxx_f32    \i^31, 1,  v11.4s, v12.4s, v0.s[3]
-            vmxx_f32    \i^31, 2,  v11.4s, v13.4s, v1.s[3]
-            vmxx_f32    \i^31, 4,  v11.4s, v14.4s, v2.s[3]
-            vmxx_f32    \i^31, 8,  v11.4s, v15.4s, v3.s[3]
-            vadd_f32    \i^31, 16, v11.4s, v11.4s, v7.4s,  v11.16b, v7.16b
-            vmxx_f32    \i^31, 1,  v19.4s, v20.4s, v0.s[3]
-            vmxx_f32    \i^31, 2,  v19.4s, v21.4s, v1.s[3]
-            vmxx_f32    \i^31, 4,  v19.4s, v22.4s, v2.s[3]
-            vmxx_f32    \i^31, 8,  v19.4s, v23.4s, v3.s[3]
-            vadd_f32    \i^31, 16, v19.4s, v19.4s, v7.4s,  v19.16b, v7.16b
-            br          x8
-
-.endr
-
-.align 6
-colormatrix_float_ldu4:
-            ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
-            uxtl        v20.8h, v20.8b
-            uxtl        v21.8h, v21.8b
-            uxtl        v22.8h, v22.8b
-            uxtl        v23.8h, v23.8b
-            uxtl        v12.4s, v20.4h
-            uxtl        v13.4s, v21.4h
-            uxtl        v14.4s, v22.4h
-            uxtl        v15.4s, v23.4h
-            uxtl2       v20.4s, v20.8h
-            uxtl2       v21.4s, v21.8h
-            uxtl2       v22.4s, v22.8h
-            uxtl2       v23.4s, v23.8h
-            ucvtf       v12.4s, v12.4s
-            ucvtf       v13.4s, v13.4s
-            ucvtf       v14.4s, v14.4s
-            ucvtf       v15.4s, v15.4s
-            ucvtf       v20.4s, v20.4s
-            ucvtf       v21.4s, v21.4s
-            ucvtf       v22.4s, v22.4s
-            ucvtf       v23.4s, v23.4s
-            br          x4
-
-.align 5
-colormatrix_int_ldu4:
-            ld4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
-            uxtl        v12.8h, v12.8b
-            uxtl        v13.8h, v13.8b
-            uxtl        v14.8h, v14.8b
-            uxtl        v15.8h, v15.8b
-            br          x4
-
-.align 6
-colormatrix_float_ldu3:
-            ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
-            uxtl        v20.8h, v20.8b
-            uxtl        v21.8h, v21.8b
-            uxtl        v22.8h, v22.8b
-            uxtl        v12.4s, v20.4h
-            uxtl        v13.4s, v21.4h
-            uxtl        v14.4s, v22.4h
-            uxtl2       v20.4s, v20.8h
-            uxtl2       v21.4s, v21.8h
-            uxtl2       v22.4s, v22.8h
-            ucvtf       v12.4s, v12.4s
-            ucvtf       v13.4s, v13.4s
-            ucvtf       v14.4s, v14.4s
-            ucvtf       v20.4s, v20.4s
-            ucvtf       v21.4s, v21.4s
-            ucvtf       v22.4s, v22.4s
-            br          x4
-
-colormatrix_int_ldu3:
-            ld4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
-            uxtl        v12.8h, v12.8b
-            uxtl        v13.8h, v13.8b
-            uxtl        v14.8h, v14.8b
-            br          x4
-
-.align 5
-colormatrix_float_ldu1:
-            ld1         {v20.8b}, [x1], #8
-            uxtl        v20.8h, v20.8b
-            uxtl        v12.4s, v20.4h
-            uxtl2       v20.4s, v20.8h
-            ucvtf       v12.4s, v12.4s
-            ucvtf       v20.4s, v20.4s
-            br          x4
-
-.align 6
-colormatrix_float_ldu2:
-            ld2         {v20.8b,v21.8b}, [x1], #16
-            uxtl        v20.8h, v20.8b
-            uxtl        v21.8h, v21.8b
-            uxtl        v12.4s, v20.4h
-            uxtl        v13.4s, v21.4h
-            uxtl2       v20.4s, v20.8h
-            uxtl2       v21.4s, v21.8h
-            ucvtf       v12.4s, v12.4s
-            ucvtf       v13.4s, v13.4s
-            ucvtf       v20.4s, v20.4s
-            ucvtf       v21.4s, v21.4s
-            br          x4
-
-.align 4
-colormatrix_int_ldu2:
-            ld2         {v12.8b,v13.8b}, [x1], #16
-            uxtl        v12.8h, v12.8b
-            uxtl        v13.8h, v13.8b
-            br          x4
-
-.align 6
-colormatrix_float_stu4:
-            fcvtzs      v24.4s, v8.4s, #1
-            fcvtzs      v25.4s, v9.4s, #1
-            fcvtzs      v26.4s, v10.4s, #1
-            fcvtzs      v27.4s, v11.4s, #1
-            fcvtzs      v28.4s, v16.4s, #1
-            fcvtzs      v29.4s, v17.4s, #1
-            fcvtzs      v30.4s, v18.4s, #1
-            fcvtzs      v31.4s, v19.4s, #1
-            sqrshrun    v24.4h, v24.4s, #1
-            sqrshrun    v25.4h, v25.4s, #1
-            sqrshrun    v26.4h, v26.4s, #1
-            sqrshrun    v27.4h, v27.4s, #1
-            sqrshrun2   v24.8h, v28.4s, #1
-            sqrshrun2   v25.8h, v29.4s, #1
-            sqrshrun2   v26.8h, v30.4s, #1
-            sqrshrun2   v27.8h, v31.4s, #1
-            uqxtn       v24.8b, v24.8h
-            uqxtn       v25.8b, v25.8h
-            uqxtn       v26.8b, v26.8h
-            uqxtn       v27.8b, v27.8h
-            subs        x2, x2, #8
-            st4         {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
-            blo         colormatrix_float_end
-            br          x9
-
-.align 5
-colormatrix_int_stu4:
-            uqxtn       v12.8b, v8.8h
-            uqxtn       v13.8b, v9.8h
-            uqxtn       v14.8b, v10.8h
-            uqxtn       v15.8b, v11.8h
-            subs        x2, x2, #8
-            st4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
-            blo         colormatrix_int_end
-            br          x9
-
-.align 6
-colormatrix_float_stu3:
-            fcvtzs      v24.4s, v8.4s, #1
-            fcvtzs      v25.4s, v9.4s, #1
-            fcvtzs      v26.4s, v10.4s, #1
-            fcvtzs      v28.4s, v16.4s, #1
-            fcvtzs      v29.4s, v17.4s, #1
-            fcvtzs      v30.4s, v18.4s, #1
-            sqrshrun    v24.4h, v24.4s, #1
-            sqrshrun    v25.4h, v25.4s, #1
-            sqrshrun    v26.4h, v26.4s, #1
-            sqrshrun2   v24.8h, v28.4s, #1
-            sqrshrun2   v25.8h, v29.4s, #1
-            sqrshrun2   v26.8h, v30.4s, #1
-            uqxtn       v24.8b, v24.8h
-            uqxtn       v25.8b, v25.8h
-            uqxtn       v26.8b, v26.8h
-            movi        v27.8b, #0
-            subs        x2, x2, #8
-            st4         {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
-            blo         colormatrix_float_end
-            br          x9
-
-.align 4
-colormatrix_int_ldu1:
-            ld1         {v12.8b}, [x1], #8
-            uxtl        v12.8h, v12.8b
-            br          x4
-
-.align 5
-colormatrix_int_stu3:
-            uqxtn       v12.8b, v8.8h
-            uqxtn       v13.8b, v9.8h
-            uqxtn       v14.8b, v10.8h
-            movi        v15.8b, #0
-            subs        x2, x2, #8
-            st4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
-            blo         colormatrix_int_end
-            br          x9
-
-.align 6
-colormatrix_float_stu2:
-            fcvtzs      v24.4s, v8.4s, #1
-            fcvtzs      v25.4s, v9.4s, #1
-            fcvtzs      v28.4s, v16.4s, #1
-            fcvtzs      v29.4s, v17.4s, #1
-            sqrshrun    v24.4h, v24.4s, #1
-            sqrshrun    v25.4h, v25.4s, #1
-            sqrshrun2   v24.8h, v28.4s, #1
-            sqrshrun2   v25.8h, v29.4s, #1
-            uqxtn       v24.8b, v24.8h
-            uqxtn       v25.8b, v25.8h
-            subs        x2, x2, #8
-            st2         {v24.8b,v25.8b}, [x0], #16
-            blo         colormatrix_float_end
-            br          x9
-
-.align 5
-colormatrix_int_stu2:
-            uqxtn       v12.8b, v8.8h
-            uqxtn       v13.8b, v9.8h
-            subs        x2, x2, #8
-            st2         {v12.8b,v13.8b}, [x0], #16
-            blo         colormatrix_int_end
-            br          x9
-
-.align 5
-colormatrix_int_stu1:
-            uqxtn       v12.8b, v8.8h
-            subs        x2, x2, #8
-            st1         {v12.8b}, [x0], #8
-            blo         colormatrix_int_end
-            br          x9
-
-colormatrix_float_ldf3:
-            ld4         {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
-            ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
-            br          x4
-
-.align 6
-colormatrix_float_stu1:
-            fcvtzs      v24.4s, v8.4s, #1
-            fcvtzs      v28.4s, v16.4s, #1
-            sqrshrun    v24.4h, v24.4s, #1
-            sqrshrun2   v24.8h, v28.4s, #1
-            uqxtn       v24.8b, v24.8h
-            subs        x2, x2, #8
-            st1         {v24.8b}, [x0], #8
-            blo         colormatrix_float_end
-            br          x9
-
-colormatrix_float_stf3:
-            movi        v11.16b, #0
-            st4         {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
-            movi        v19.16b, #0
-            subs        x2, x2, #8
-            st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
-            blo         colormatrix_float_end
-            br          x9
-
-.align 5
-colormatrix_float_stf4:
-            st4         {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
-            subs        x2, x2, #8
-            st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
-            blo         colormatrix_float_end
-            br          x9
-
-colormatrix_float_ldf4:
-            ld4         {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
-            ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
-            br          x4
-
-.align 5
-colormatrix_float_stf2:
-            st2         {v8.4s, v9.4s}, [x0], #32
-            subs        x2, x2, #8
-            st2         {v16.4s, v17.4s}, [x0], #32
-            blo         colormatrix_float_end
-            br          x9
-
-colormatrix_float_ldf2:
-            ld2         {v12.4s,v13.4s}, [x1], #32
-            ld2         {v20.4s,v21.4s}, [x1], #32
-            br          x4
-
-.align 5
-colormatrix_float_stf1:
-            st1         {v8.4s}, [x0], #16
-            subs        x2, x2, #8
-            st1         {v16.4s}, [x0], #16
-            blo         colormatrix_float_end
-            br          x9
-
-colormatrix_float_ldf1:
-            ld1         {v12.4s}, [x1], #16
-            ld1         {v20.4s}, [x1], #16
-            br          x4
-
-colormatrix_int_stu1_end:
-            uqxtn       v12.8b, v8.8h
-            tbz         x2, #2, 1f
-            st1         {v12.s}[1], [x0], #4
-1:          tbz         x2, #1, 1f
-            st1         {v12.h}[1], [x0], #2
-1:          tbz         x2, #0, 1f
-            st1         {v12.b}[1], [x0], #1
-1:          b           colormatrix_int_realend
-
-colormatrix_int_stu2_end:
-            uqxtn       v12.8b, v8.8h
-            uqxtn       v13.8b, v9.8h
-            zip1        v12.16b, v12.16b, v13.16b
-            tbz         x2, #2, 1f
-            st1         {v12.d}[1], [x0], #8
-1:          tbz         x2, #1, 1f
-            st1         {v12.s}[1], [x0], #4
-1:          tbz         x2, #0, 1f
-            st1         {v12.h}[1], [x0], #2
-1:          b           colormatrix_int_realend
-
-colormatrix_int_stu3_end:
-            uqxtn       v12.8b, v8.8h
-            uqxtn       v13.8b, v9.8h
-            uqxtn       v14.8b, v10.8h
-            movi        v15.8b, #0
-            tbz         x2, #2, 1f
-            st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
-            st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
-            st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
-            st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
-1:          tbz         x2, #1, 1f
-            st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
-            st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
-1:          tbz         x2, #0, 1f
-            st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
-1:          b           colormatrix_int_realend
-
-colormatrix_int_stu4_end:
-            uqxtn       v12.8b, v8.8h
-            uqxtn       v13.8b, v9.8h
-            uqxtn       v14.8b, v10.8h
-            uqxtn       v15.8b, v11.8h
-            tbz         x2, #2, 1f
-            st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
-            st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
-            st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
-            st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
-1:          tbz         x2, #1, 1f
-            st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
-            st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
-1:          tbz         x2, #0, 1f
-            st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
-1:          b           colormatrix_int_realend
-
-
-colormatrix_int_ldu1_end:
-            tbz         x2, #2, 1f
-            ld1         {v15.s}[3], [x1], #4
-1:          tbz         x2, #1, 1f
-            ld1         {v15.h}[5], [x1], #2
-1:          tbz         x2, #0, 1f
-            ld1         {v15.b}[9], [x1], #1
-1:          uxtl2       v12.8h, v15.16b
-            br          x4
-
-colormatrix_int_ldu2_end:
-            tbz         x2, #2, 1f
-            ld1         {v15.d}[1], [x1], #8
-1:          tbz         x2, #1, 1f
-            ld1         {v15.s}[1], [x1], #4
-1:          tbz         x2, #0, 1f
-            ld1         {v15.h}[1], [x1], #2
-1:          uzp1        v14.16b, v15.16b, v15.16b
-            uzp2        v15.16b, v15.16b, v15.16b
-            uxtl        v12.8h, v14.8b
-            uxtl        v13.8h, v15.8b
-            br          x4
-
-colormatrix_int_ldu3_end:
-            tbz         x2, #2, 1f
-            ld4         {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4
-            ld4         {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4
-            ld4         {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4
-            ld4         {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4
-1:          tbz         x2, #1, 1f
-            ld4         {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4
-            ld4         {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4
-1:          tbz         x2, #0, 1f
-            ld4         {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4
-1:          uxtl        v12.8h, v12.8b
-            uxtl        v13.8h, v13.8b
-            uxtl        v14.8h, v14.8b
-            br          x4
-
-colormatrix_int_ldu4_end:
-            tbz         x2, #2, 1f
-            ld4         {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4
-            ld4         {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4
-            ld4         {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4
-            ld4         {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4
-1:          tbz         x2, #1, 1f
-            ld4         {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4
-            ld4         {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4
-1:          tbz         x2, #0, 1f
-            ld4         {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4
-1:          uxtl        v12.8h, v12.8b
-            uxtl        v13.8h, v13.8b
-            uxtl        v14.8h, v14.8b
-            uxtl        v15.8h, v15.8b
-            br          x4
-
-colormatrix_float_stu1_end:
-            fcvtzs      v12.4s, v8.4s, #1
-            fcvtzs      v13.4s, v16.4s, #1
-            sqrshrun    v12.4h, v12.4s, #1
-            sqrshrun2   v12.8h, v13.4s, #1
-            uqxtn       v12.8b, v12.8h
-            tbz         x2, #2, 1f
-            st1         {v12.s}[1], [x0], #4
-1:          tbz         x2, #1, 1f
-            st1         {v12.h}[1], [x0], #2
-1:          tbz         x2, #0, 1f
-            st1         {v12.b}[1], [x0], #1
-1:          b           colormatrix_float_realend
-
-colormatrix_float_stu2_end:
-            fcvtzs      v12.4s, v8.4s, #1
-            fcvtzs      v13.4s, v9.4s, #1
-            fcvtzs      v14.4s, v16.4s, #1
-            fcvtzs      v15.4s, v17.4s, #1
-            sqrshrun    v12.4h, v12.4s, #1
-            sqrshrun    v13.4h, v13.4s, #1
-            sqrshrun    v14.4h, v14.4s, #1
-            sqrshrun    v15.4h, v15.4s, #1
-            zip1        v12.8h, v12.8h, v13.8h
-            zip1        v13.8h, v14.8h, v15.8h
-            uqxtn       v12.8b, v12.8h
-            uqxtn2      v12.16b, v13.8h
-            tbz         x2, #2, 1f
-            st1         {v12.d}[1], [x0], #8
-1:          tbz         x2, #1, 1f
-            st1         {v12.s}[1], [x0], #4
-1:          tbz         x2, #0, 1f
-            st1         {v12.h}[1], [x0], #2
-1:          b           colormatrix_float_realend
-
-colormatrix_float_stu3_end:
-            fcvtzs      v24.4s, v8.4s, #1
-            fcvtzs      v25.4s, v9.4s, #1
-            fcvtzs      v26.4s, v10.4s, #1
-            fcvtzs      v28.4s, v16.4s, #1
-            fcvtzs      v29.4s, v17.4s, #1
-            fcvtzs      v30.4s, v18.4s, #1
-            sqrshrun    v24.4h, v24.4s, #1
-            sqrshrun    v25.4h, v25.4s, #1
-            sqrshrun    v26.4h, v26.4s, #1
-            sqrshrun2   v24.8h, v28.4s, #1
-            sqrshrun2   v25.8h, v29.4s, #1
-            sqrshrun2   v26.8h, v30.4s, #1
-            uqxtn       v12.8b, v24.8h
-            uqxtn       v13.8b, v25.8h
-            uqxtn       v14.8b, v26.8h
-            movi        v15.8b, #0
-            tbz         x2, #2, 1f
-            st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
-            st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
-            st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
-            st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
-1:          tbz         x2, #1, 1f
-            st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
-            st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
-1:          tbz         x2, #0, 1f
-            st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
-1:          b           colormatrix_float_realend
-
-colormatrix_float_stu4_end:
-            fcvtzs      v24.4s, v8.4s, #1
-            fcvtzs      v25.4s, v9.4s, #1
-            fcvtzs      v26.4s, v10.4s, #1
-            fcvtzs      v27.4s, v11.4s, #1
-            fcvtzs      v28.4s, v16.4s, #1
-            fcvtzs      v29.4s, v17.4s, #1
-            fcvtzs      v30.4s, v18.4s, #1
-            fcvtzs      v31.4s, v19.4s, #1
-            sqrshrun    v24.4h, v24.4s, #1
-            sqrshrun    v25.4h, v25.4s, #1
-            sqrshrun    v26.4h, v26.4s, #1
-            sqrshrun    v27.4h, v27.4s, #1
-            sqrshrun2   v24.8h, v28.4s, #1
-            sqrshrun2   v25.8h, v29.4s, #1
-            sqrshrun2   v26.8h, v30.4s, #1
-            sqrshrun2   v27.8h, v31.4s, #1
-            uqxtn       v12.8b, v24.8h
-            uqxtn       v13.8b, v25.8h
-            uqxtn       v14.8b, v26.8h
-            uqxtn       v15.8b, v27.8h
-            tbz         x2, #2, 1f
-            st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
-            st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
-            st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
-            st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
-1:          tbz         x2, #1, 1f
-            st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
-            st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
-1:          tbz         x2, #0, 1f
-            st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
-1:          b           colormatrix_float_realend
-
-colormatrix_float_stf1_end:
-            tbz         x2, #2, 1f
-            st1         {v16.4s}, [x0], #16
-1:          tbz         x2, #1, 1f
-            st1         {v8.d}[1], [x0], #8
-1:          tbz         x2, #0, 1f
-            st1         {v8.s}[1], [x0], #4
-1:          b           colormatrix_float_realend
-
-colormatrix_float_stf2_end:
-            tbz         x2, #2, 1f
-            st2         {v16.4s, v17.4s}, [x0], #32
-1:          tbz         x2, #1, 1f
-            st2         {v8.s,v9.s}[2], [x0], #8
-            st2         {v8.s,v9.s}[3], [x0], #8
-1:          tbz         x2, #0, 1f
-            st2         {v8.s,v9.s}[1], [x0], #8
-1:          b           colormatrix_float_realend
-
-colormatrix_float_stf3_end:
-            movi        v11.16b, #0
-            movi        v19.16b, #0
-colormatrix_float_stf4_end:
-            tbz         x2, #2, 1f
-            st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
-1:          tbz         x2, #1, 1f
-            st4         {v8.s,v9.s,v10.s,v11.s}[2], [x0], #16
-            st4         {v8.s,v9.s,v10.s,v11.s}[3], [x0], #16
-1:          tbz         x2, #0, 1f
-            st4         {v8.s,v9.s,v10.s,v11.s}[1], [x0], #16
-1:          b           colormatrix_float_realend
-
-colormatrix_float_ldu1_end:
-            tbz         x2, #2, 1f
-            ld1         {v15.s}[1], [x1], #4
-1:          tbz         x2, #1, 1f
-            ld1         {v15.h}[1], [x1], #2
-1:          tbz         x2, #0, 1f
-            ld1         {v15.b}[1], [x1], #1
-1:          uxtl        v15.8h, v15.8b
-            uxtl        v12.4s, v15.4h
-            uxtl2       v20.4s, v15.8h
-            ucvtf       v12.4s, v12.4s
-            ucvtf       v20.4s, v20.4s
-            br          x4
-
-colormatrix_float_ldu2_end:
-            tbz         x2, #2, 1f
-            ld1         {v15.d}[1], [x1], #8
-1:          tbz         x2, #1, 1f
-            ld1         {v15.s}[1], [x1], #4
-1:          tbz         x2, #0, 1f
-            ld1         {v15.h}[1], [x1], #2
-1:          uxtl        v14.8h, v15.8b
-            uxtl2       v15.8h, v15.16b
-            uzp1        v12.8h, v14.8h, v14.8h
-            uzp2        v13.8h, v14.8h, v14.8h
-            uzp1        v20.8h, v15.8h, v15.8h
-            uzp2        v21.8h, v15.8h, v15.8h
-            uxtl        v12.4s, v12.4h
-            uxtl        v13.4s, v13.4h
-            uxtl        v20.4s, v20.4h
-            uxtl        v21.4s, v21.4h
-            ucvtf       v12.4s, v12.4s
-            ucvtf       v13.4s, v13.4s
-            ucvtf       v20.4s, v20.4s
-            ucvtf       v21.4s, v21.4s
-            br          x4
-
-colormatrix_float_ldu3_end:
-            tbz         x2, #2, 1f
-            ld4         {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4
-            ld4         {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4
-            ld4         {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4
-            ld4         {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4
-1:          tbz         x2, #1, 1f
-            ld4         {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4
-            ld4         {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4
-1:          tbz         x2, #0, 1f
-            ld4         {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4
-1:          uxtl        v20.8h, v20.8b
-            uxtl        v21.8h, v21.8b
-            uxtl        v22.8h, v22.8b
-            uxtl        v12.4s, v20.4h
-            uxtl        v13.4s, v21.4h
-            uxtl        v14.4s, v22.4h
-            uxtl2       v20.4s, v20.8h
-            uxtl2       v21.4s, v21.8h
-            uxtl2       v22.4s, v22.8h
-            ucvtf       v12.4s, v12.4s
-            ucvtf       v13.4s, v13.4s
-            ucvtf       v14.4s, v14.4s
-            ucvtf       v20.4s, v20.4s
-            ucvtf       v21.4s, v21.4s
-            ucvtf       v22.4s, v22.4s
-            br          x4
-
-colormatrix_float_ldu4_end:
-            tbz         x2, #2, 1f
-            ld4         {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4
-            ld4         {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4
-            ld4         {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4
-            ld4         {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4
-1:          tbz         x2, #1, 1f
-            ld4         {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4
-            ld4         {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4
-1:          tbz         x2, #0, 1f
-            ld4         {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4
-1:          uxtl        v20.8h, v20.8b
-            uxtl        v21.8h, v21.8b
-            uxtl        v22.8h, v22.8b
-            uxtl        v23.8h, v23.8b
-            uxtl        v12.4s, v20.4h
-            uxtl        v13.4s, v21.4h
-            uxtl        v14.4s, v22.4h
-            uxtl        v15.4s, v23.4h
-            uxtl2       v20.4s, v20.8h
-            uxtl2       v21.4s, v21.8h
-            uxtl2       v22.4s, v22.8h
-            uxtl2       v23.4s, v23.8h
-            ucvtf       v12.4s, v12.4s
-            ucvtf       v13.4s, v13.4s
-            ucvtf       v14.4s, v14.4s
-            ucvtf       v15.4s, v15.4s
-            ucvtf       v20.4s, v20.4s
-            ucvtf       v21.4s, v21.4s
-            ucvtf       v22.4s, v22.4s
-            ucvtf       v23.4s, v23.4s
-            br          x4
-
-colormatrix_float_ldf1_end:
-            tbz         x2, #2, 1f
-            ld1         {v20.4s}, [x1], #16
-1:          tbz         x2, #1, 1f
-            ld1         {v12.d}[1], [x1], #8
-1:          tbz         x2, #0, 1f
-            ld1         {v12.s}[1], [x1], #4
-1:          br          x4
-
-colormatrix_float_ldf2_end:
-            tbz         x2, #2, 1f
-            ld2         {v20.4s,v21.4s}, [x1], #32
-1:          tbz         x2, #1, 1f
-            ld2         {v12.s,v13.s}[2], [x1], #8
-            ld2         {v12.s,v13.s}[3], [x1], #8
-1:          tbz         x2, #0, 1f
-            ld2         {v12.s,v13.s}[1], [x1], #8
-1:          br          x4
-
-colormatrix_float_ldf3_end:
-colormatrix_float_ldf4_end:
-            tbz         x2, #2, 1f
-            ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
-1:          tbz         x2, #1, 1f
-            ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x1], #16
-            ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x1], #16
-1:          tbz         x2, #0, 1f
-            ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x1], #16
-1:          br          x4
-
-/* void rsdIntrinsicColorMatrix_int_K(
- *          void *out,              // x0
- *          void const *in,         // x1
- *          size_t count,           // x2
- *          fntab_t const *fns,     // x3
- *          int16_t const *mult,    // x4
- *          int32_t const *add);    // x5
- */
-ENTRY(rsdIntrinsicColorMatrix_int_K)
-            sub         x7, sp, #32
-            sub         sp, sp, #64
-            st1         {v8.1d-v11.1d}, [sp]
-            st1         {v12.1d-v15.1d}, [x7]
-
-            ld1         {v0.8h,v1.8h}, [x4], #32
-            ld1         {v4.4s}, [x5], #16
-
-            ldp         x4,x5, [x3],#16
-            ldp         x6,x7, [x3],#16
-            ldp         x8,x9, [x3],#16
-
-            dup         v12.4s, v4.s[0]
-            dup         v13.4s, v4.s[1]
-            dup         v14.4s, v4.s[2]
-            dup         v15.4s, v4.s[3]
-            sqshrun     v8.4h, v12.4s, #8
-            sqshrun2    v8.8h, v12.4s, #8
-            sqshrun     v9.4h, v13.4s, #8
-            sqshrun2    v9.8h, v13.4s, #8
-            sqshrun     v10.4h, v14.4s, #8
-            sqshrun2    v10.8h, v14.4s, #8
-            sqshrun     v11.4h, v15.4s, #8
-            sqshrun2    v11.8h, v15.4s, #8
-
-            subs        x2, x2, #8
-            blo         colormatrix_int_end
-            br          x9
-
-colormatrix_int_end:
-            adds        x2, x2, #8
-            bls         colormatrix_int_realend
-            mov         x16, x8
-            ldp         x8, x9, [x3], #16
-            cmp         x4, x16
-            csel        x4, x8, x4, eq
-            cmp         x5, x16
-            csel        x5, x8, x5, eq
-            cmp         x6, x16
-            csel        x6, x8, x6, eq
-            cmp         x7, x16
-            csel        x7, x8, x7, eq
-            br          x9
-
-colormatrix_int_realend:
-            ld1         {v8.1d-v11.1d}, [sp], #32
-            ld1         {v12.1d-v15.1d}, [sp], #32
-            ret
-END(rsdIntrinsicColorMatrix_int_K)
-
-/* void rsdIntrinsicColorMatrixSetup_int_K(
- *          fntab_t const *fns, // x0
- *          uint32_t mask,      // x1
- *          int dt,             // x2
- *          int st);            // x3
- */
-ENTRY(rsdIntrinsicColorMatrixSetup_int_K)
-            adrp        x7, 2f
-            add         x7, x7, :lo12:2f
-            add         x4, x7, x2, LSL #3
-            ldrsw       x2, [x4], #4
-            ldrsw       x4, [x4]
-            add         x2, x2, x7
-            add         x4, x4, x7
-            adrp        x7, 3f
-            add         x7, x7, :lo12:3f
-            add         x5, x7, x3, LSL #3
-            ldrsw       x3, [x5], #4
-            ldrsw       x5, [x5]
-            add         x3, x3, x7
-            add         x5, x5, x7
-            stp         x2, x3, [x0, #32]
-            stp         x4, x5, [x0, #48]
-
-/* For each column function, if the matrix is all zeroes then write NULL,
- * otherwise look up the appropriate function and store that. */
-
-            mov         x3, #4
-            adrp        x7, 4f
-            add         x7, x7, :lo12:4f
-1:          ands        x2, x1, #15
-            beq         9f
-            and         x2, x1, #31
-            lsl         x2, x2, #4
-            ldrsw       x2, [x7, x2]
-            add         x2, x2, x7
-9:          str         x2, [x0], #8
-            lsr         x1, x1, #5
-            add         x7, x7, #4
-            subs        x3, x3, #1
-            bne         1b
-
-/* For every NULL entry, copy the non-NULL entry that follows it, or the store
- * function. */
-
-            ldr         x2, [x0]
-            mov         x3, #4
-1:          ldr         x1, [x0, #-8]!
-            cmp         x1, #0
-            csel        x2, x1, x2, ne
-            str         x2, [x0]
-            subs        x3, x3, #1
-            bne         1b
-            ret
-
-END(rsdIntrinsicColorMatrixSetup_int_K)
-.rodata
-            .align 4
-2:          .word      colormatrix_int_stu1-2b
-            .word      colormatrix_int_stu1_end-2b
-            .word      colormatrix_int_stu2-2b
-            .word      colormatrix_int_stu2_end-2b
-            .word      colormatrix_int_stu3-2b
-            .word      colormatrix_int_stu3_end-2b
-            .word      colormatrix_int_stu4-2b
-            .word      colormatrix_int_stu4_end-2b
-3:          .word      colormatrix_int_ldu1-3b
-            .word      colormatrix_int_ldu1_end-3b
-            .word      colormatrix_int_ldu2-3b
-            .word      colormatrix_int_ldu2_end-3b
-            .word      colormatrix_int_ldu3-3b
-            .word      colormatrix_int_ldu3_end-3b
-            .word      colormatrix_int_ldu4-3b
-            .word      colormatrix_int_ldu4_end-3b
-4:
-.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-            .word      colormatrix_int_col0_\i-4b
-            .word      colormatrix_int_col1_\i-4b-4
-            .word      colormatrix_int_col2_\i-4b-8
-            .word      colormatrix_int_col3_\i-4b-12
-.endr
-.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-            .word      colormatrix_int_col0_n\i-4b
-            .word      colormatrix_int_col1_n\i-4b-4
-            .word      colormatrix_int_col2_n\i-4b-8
-            .word      colormatrix_int_col3_n\i-4b-12
-.endr
-
-
-/* void rsdIntrinsicColorMatrix_float_K(
- *          void *out,              // x0
- *          void const *in,         // x1
- *          size_t count,           // x2
- *          fntab_t const *fns,     // x3
- *          float const *mult,      // x4
- *          float const *add);      // x5
- */
-ENTRY(rsdIntrinsicColorMatrix_float_K)
-            sub         x7, sp, #32
-            sub         sp, sp, #64
-            st1         {v8.1d-v11.1d}, [sp]
-            st1         {v12.1d-v15.1d}, [x7]
-
-            ld1         {v0.4s,v1.4s,v2.4s,v3.4s}, [x4], #64
-            ld1r        {v4.4s}, [x5], #4
-            ld1r        {v5.4s}, [x5], #4
-            ld1r        {v6.4s}, [x5], #4
-            ld1r        {v7.4s}, [x5], #4
-
-            ldp         x4,x5, [x3], #16
-            ldp         x6,x7, [x3], #16
-            ldp         x8,x9, [x3], #16
-
-            mov         v8.16b, v4.16b
-            mov         v9.16b, v5.16b
-            mov         v10.16b, v6.16b
-            mov         v11.16b, v7.16b
-
-            mov         v16.16b, v4.16b
-            mov         v17.16b, v5.16b
-            mov         v18.16b, v6.16b
-            mov         v19.16b, v7.16b
-
-            subs        x2, x2, #8
-            blo         colormatrix_float_end
-            br          x9
-
-colormatrix_float_end:
-            adds        x2, x2, #8
-            bls         colormatrix_int_realend
-            mov         x16, x8
-            ldp         x8,x9, [x3], #16
-            cmp         x4, x16
-            csel        x4, x8, x4, eq
-            cmp         x5, x16
-            csel        x5, x8, x5, eq
-            cmp         x6, x16
-            csel        x6, x8, x6, eq
-            cmp         x7, x16
-            csel        x7, x8, x7, eq
-            br          x9
-
-colormatrix_float_realend:
-            ld1         {v8.1d-v11.1d}, [sp], #32
-            ld1         {v12.1d-v15.1d}, [sp], #32
-            ret
-END(rsdIntrinsicColorMatrix_float_K)
-
-/* void rsdIntrinsicColorMatrixSetup_float_K(
- *          fntab_t const *fns, // x0
- *          uint32_t mask,      // x1
- *          int dt,             // x2
- *          int st);            // x3
- */
-ENTRY(rsdIntrinsicColorMatrixSetup_float_K)
-            adrp        x7, 2f
-            add         x7, x7, :lo12:2f
-            add         x4, x7, x2, LSL #3
-            ldrsw       x2, [x4], #4
-            ldrsw       x4, [x4]
-            add         x2, x2, x7
-            add         x4, x4, x7
-            adrp        x7, 3f
-            add         x7, x7, :lo12:3f
-            add         x5, x7, x3, LSL #3
-            ldrsw       x3, [x5], #4
-            ldrsw       x5, [x5]
-            add         x3, x3, x7
-            add         x5, x5, x7
-            stp         x2, x3, [x0, #32]
-            stp         x4, x5, [x0, #48]
-
-/* For each column function, if the matrix is all zeroes then write NULL,
- * otherwise look up the appropriate function and store that. */
-
-            mov         x3, #4
-            adrp        x7, 4f
-            add         x7, x7, :lo12:4f
-1:          ands        x2, x1, #15
-            beq         9f
-            and         x2, x1, #31
-            lsl         x2, x2, #4
-            ldrsw       x2, [x7, x2]
-            add         x2, x2, x7
-9:          str         x2, [x0], #8
-            lsr         x1, x1, #5
-            add         x7, x7, #4
-            subs        x3, x3, #1
-            bne         1b
-
-/* For every NULL entry, copy the non-NULL entry that follows it, or the store
- * function. */
-
-            ldr         x2, [x0]
-            mov         x3, #4
-1:          ldr         x1, [x0, #-8]!
-            cmp         x1, #0
-            csel        x2, x1, x2, ne
-            str         x2, [x0]
-            subs        x3, x3, #1
-            bne         1b
-            ret
-
-END(rsdIntrinsicColorMatrixSetup_float_K)
-.rodata
-            .align 4
-2:          .word      colormatrix_float_stu1-2b
-            .word      colormatrix_float_stu1_end-2b
-            .word      colormatrix_float_stu2-2b
-            .word      colormatrix_float_stu2_end-2b
-            .word      colormatrix_float_stu3-2b
-            .word      colormatrix_float_stu3_end-2b
-            .word      colormatrix_float_stu4-2b
-            .word      colormatrix_float_stu4_end-2b
-            .word      colormatrix_float_stf1-2b
-            .word      colormatrix_float_stf1_end-2b
-            .word      colormatrix_float_stf2-2b
-            .word      colormatrix_float_stf2_end-2b
-            .word      colormatrix_float_stf3-2b
-            .word      colormatrix_float_stf3_end-2b
-            .word      colormatrix_float_stf4-2b
-            .word      colormatrix_float_stf4_end-2b
-3:          .word      colormatrix_float_ldu1-3b
-            .word      colormatrix_float_ldu1_end-3b
-            .word      colormatrix_float_ldu2-3b
-            .word      colormatrix_float_ldu2_end-3b
-            .word      colormatrix_float_ldu3-3b
-            .word      colormatrix_float_ldu3_end-3b
-            .word      colormatrix_float_ldu4-3b
-            .word      colormatrix_float_ldu4_end-3b
-            .word      colormatrix_float_ldf1-3b
-            .word      colormatrix_float_ldf1_end-3b
-            .word      colormatrix_float_ldf2-3b
-            .word      colormatrix_float_ldf2_end-3b
-            .word      colormatrix_float_ldf3-3b
-            .word      colormatrix_float_ldf3_end-3b
-            .word      colormatrix_float_ldf4-3b
-            .word      colormatrix_float_ldf4_end-3b
-4:
-.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-            .word      colormatrix_float_col0_\i-4b
-            .word      colormatrix_float_col1_\i-4b-4
-            .word      colormatrix_float_col2_\i-4b-8
-            .word      colormatrix_float_col3_\i-4b-12
-.endr
-.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-            .word      colormatrix_float_col0_n\i-4b
-            .word      colormatrix_float_col1_n\i-4b-4
-            .word      colormatrix_float_col2_n\i-4b-8
-            .word      colormatrix_float_col3_n\i-4b-12
-.endr

diff --git a/toolkit/ColorMatrix_neon.S b/toolkit/ColorMatrix_neon.S
deleted file mode 100644
index ecb8c13..0000000
--- a/toolkit/ColorMatrix_neon.S
+++ /dev/null

@@ -1,361 +0,0 @@
-/*
- * Copyright (C) 2013 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define SNIP_START(x) \
-    .globl x; x:
-
-#define SNIP_END(x) \
-    .globl x##_end; x##_end: \
-    .globl x##_len; x##_len: \
-    .word x##_end-x
-
-SNIP_START(_N_ColorMatrix_prefix_i)
-    stmfd           sp!, {r4, lr}
-    vpush           {q4-q7}
-    vld1.16 {q2}, [r2]!
-    vld1.16 {q3}, [r2]!
-    vld1.32 {d8[],d9[]}, [r2]!
-    vld1.32 {d10[],d11[]}, [r2]!
-    vld1.32 {d12[],d13[]}, [r2]!
-    vld1.32 {d14[],d15[]}, [r2]!
-    veor q0, q0
-    veor q1, q1
-    veor q9, q9
-    veor q10, q10
-    veor q11, q11
-SNIP_END(_N_ColorMatrix_prefix_i)
-
-SNIP_START(_N_ColorMatrix_prefix_f)
-    stmfd           sp!, {r4, lr}
-    vpush           {q4-q7}
-    add r2, #48
-    vld1.32 {q4}, [r2]!
-    vld1.32 {q5}, [r2]!
-    vld1.32 {q6}, [r2]!
-    vld1.32 {q7}, [r2]!
-    vld1.32 {d16[],d17[]}, [r2]!
-    vld1.32 {d18[],d19[]}, [r2]!
-    vld1.32 {d20[],d21[]}, [r2]!
-    vld1.32 {d22[],d23[]}, [r2]!
-    veor q1, q1
-    veor q2, q2
-    veor q3, q3
-SNIP_END(_N_ColorMatrix_prefix_f)
-
-SNIP_START(_N_ColorMatrix_postfix1)
-    subs r3, r3, #1
-    #bne 1b
-SNIP_END(_N_ColorMatrix_postfix1)
-
-SNIP_START(_N_ColorMatrix_postfix2)
-
-    #mov r0, #0
-    #ldr r0, [r0]
-
-    #vqadd.s32 q0,q0,q0
-    #vadd.f32 q0,q0,q0
-    #vmul.f32 q0,q0,d0[0]
-    #vmla.f32 q0,q0,d0[0]
-    #vmov q0, q0
-
-
-    vpop            {q4-q7}
-    ldmfd           sp!, {r4, lr}
-    bx              lr
-SNIP_END(_N_ColorMatrix_postfix2)
-
-SNIP_START(_N_ColorMatrix_load_u8_4)
-    vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
-    vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
-    vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
-    vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
-SNIP_END(_N_ColorMatrix_load_u8_4)
-
-SNIP_START(_N_ColorMatrix_load_u8_3)
-    vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
-    vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
-    vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
-    vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
-    veor d3, d3
-SNIP_END(_N_ColorMatrix_load_u8_3)
-
-SNIP_START(_N_ColorMatrix_load_u8_2)
-    vld2.8 {d0[0],d1[0]}, [r1]!
-    vld2.8 {d0[1],d1[1]}, [r1]!
-    vld2.8 {d0[2],d1[2]}, [r1]!
-    vld2.8 {d0[3],d1[3]}, [r1]!
-    veor d2, d2
-    veor d3, d3
-SNIP_END(_N_ColorMatrix_load_u8_2)
-
-SNIP_START(_N_ColorMatrix_load_u8_1)
-    vld1.32 {d0[0]}, [r1]!
-    veor d1, d1
-    veor d2, d2
-    veor d3, d3
-SNIP_END(_N_ColorMatrix_load_u8_1)
-
-SNIP_START(_N_ColorMatrix_load_u8f_4)
-    vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
-    vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
-    vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
-    vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
-    vmovl.u8 q3, d3
-    vmovl.u8 q2, d2
-    vmovl.u8 q1, d1
-    vmovl.u8 q0, d0
-    vmovl.u16 q3, d6
-    vmovl.u16 q2, d4
-    vmovl.u16 q1, d2
-    vmovl.u16 q0, d0
-    vcvt.f32.s32 q3, q3
-    vcvt.f32.s32 q2, q2
-    vcvt.f32.s32 q1, q1
-    vcvt.f32.s32 q0, q0
-SNIP_END(_N_ColorMatrix_load_u8f_4)
-
-SNIP_START(_N_ColorMatrix_load_u8f_3)
-    vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
-    vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
-    vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
-    vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
-    vmovl.u8 q2, d2
-    vmovl.u8 q1, d1
-    vmovl.u8 q0, d0
-    vmovl.u16 q2, d4
-    vmovl.u16 q1, d2
-    vmovl.u16 q0, d0
-    vcvt.f32.s32 q2, q2
-    vcvt.f32.s32 q1, q1
-    vcvt.f32.s32 q0, q0
-    veor q3, q3
-SNIP_END(_N_ColorMatrix_load_u8f_3)
-
-SNIP_START(_N_ColorMatrix_load_u8f_2)
-    vld2.8 {d0[0],d1[0]}, [r1]!
-    vld2.8 {d0[1],d1[1]}, [r1]!
-    vld2.8 {d0[2],d1[2]}, [r1]!
-    vld2.8 {d0[3],d1[3]}, [r1]!
-    vmovl.u8 q1, d1
-    vmovl.u8 q0, d0
-    vmovl.u16 q1, d2
-    vmovl.u16 q0, d0
-    vcvt.f32.s32 q1, q1
-    vcvt.f32.s32 q0, q0
-    veor q2, q2
-    veor q3, q3
-SNIP_END(_N_ColorMatrix_load_u8f_2)
-
-SNIP_START(_N_ColorMatrix_load_u8f_1)
-    vld1.32 {d0[0]}, [r1]!
-    vmovl.u8 q0, d0
-    vmovl.u16 q0, d0
-    vcvt.f32.s32 q0, q0
-    veor q1, q1
-    veor q2, q2
-    veor q3, q3
-SNIP_END(_N_ColorMatrix_load_u8f_1)
-
-SNIP_START(_N_ColorMatrix_load_f32_4)
-    vld4.32 {d0[0],d2[0],d4[0],d6[0]}, [r1]!
-    vld4.32 {d0[1],d2[1],d4[1],d6[1]}, [r1]!
-    vld4.32 {d1[0],d3[0],d5[0],d7[0]}, [r1]!
-    vld4.32 {d1[1],d3[1],d5[1],d7[1]}, [r1]!
-SNIP_END(_N_ColorMatrix_load_f32_4)
-
-SNIP_START(_N_ColorMatrix_load_f32_3)
-    vld3.32 {d0[0],d2[0],d4[0]}, [r1]!
-    add r1, r1, #4
-    vld3.32 {d0[1],d2[1],d4[1]}, [r1]!
-    add r1, r1, #4
-    vld3.32 {d1[0],d3[0],d5[0]}, [r1]!
-    add r1, r1, #4
-    vld3.32 {d1[1],d3[1],d5[1]}, [r1]!
-    add r1, r1, #4
-    veor q3, q3
-SNIP_END(_N_ColorMatrix_load_f32_3)
-
-SNIP_START(_N_ColorMatrix_load_f32_2)
-    vld2.32 {d0[0],d2[0]}, [r1]!
-    vld2.32 {d0[1],d2[1]}, [r1]!
-    vld2.32 {d1[0],d3[0]}, [r1]!
-    vld2.32 {d1[1],d3[1]}, [r1]!
-    veor q2, q2
-    veor q3, q3
-SNIP_END(_N_ColorMatrix_load_f32_2)
-
-SNIP_START(_N_ColorMatrix_load_f32_1)
-    vld1.32 {q0}, [r1]!
-    veor q1, q1
-    veor q2, q2
-    veor q3, q3
-SNIP_END(_N_ColorMatrix_load_f32_1)
-
-
-SNIP_START(_N_ColorMatrix_store_u8_4)
-#mov r0, #0
-    vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
-    vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
-    vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
-    vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
-SNIP_END(_N_ColorMatrix_store_u8_4)
-
-SNIP_START(_N_ColorMatrix_store_u8_2)
-    vst2.8 {d0[0],d1[0]}, [r0]!
-    vst2.8 {d0[1],d1[1]}, [r0]!
-    vst2.8 {d0[2],d1[2]}, [r0]!
-    vst2.8 {d0[3],d1[3]}, [r0]!
-SNIP_END(_N_ColorMatrix_store_u8_2)
-
-SNIP_START(_N_ColorMatrix_store_u8_1)
-    vst1.32 {d0[0]}, [r0]!
-SNIP_END(_N_ColorMatrix_store_u8_1)
-
-
-SNIP_START(_N_ColorMatrix_store_f32u_4)
-    vcvt.s32.f32 q0, q0
-    vcvt.s32.f32 q1, q1
-    vcvt.s32.f32 q2, q2
-    vcvt.s32.f32 q3, q3
-    vqmovn.s32 d0, q0
-    vqmovn.s32 d2, q1
-    vqmovn.s32 d4, q2
-    vqmovn.s32 d6, q3
-    vqmovun.s16 d0, q0
-    vqmovun.s16 d1, q1
-    vqmovun.s16 d2, q2
-    vqmovun.s16 d3, q3
-    vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
-    vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
-    vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
-    vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
-
-    #mov r0, #0
-    #ldr r0, [r0]
-
-SNIP_END(_N_ColorMatrix_store_f32u_4)
-
-SNIP_START(_N_ColorMatrix_store_f32u_2)
-    vcvt.s32.f32 q0, q0
-    vcvt.s32.f32 q1, q1
-    vqmovn.s32 d0, q0
-    vqmovn.s32 d2, q1
-    vqmovun.s16 d0, q0
-    vqmovun.s16 d1, q1
-    vst2.8 {d0[0],d1[0]}, [r0]!
-    vst2.8 {d0[1],d1[1]}, [r0]!
-    vst2.8 {d0[2],d1[2]}, [r0]!
-    vst2.8 {d0[3],d1[3]}, [r0]!
-SNIP_END(_N_ColorMatrix_store_f32u_2)
-
-SNIP_START(_N_ColorMatrix_store_f32u_1)
-    vcvt.s32.f32 q0, q0
-    vqmovn.s32 d0, q0
-    vqmovun.s16 d0, q0
-    vst1.32 {d0[0]}, [r0]!
-SNIP_END(_N_ColorMatrix_store_f32u_1)
-
-SNIP_START(_N_ColorMatrix_store_f32_4)
-    vst4.32 {d0[0],d2[0],d4[0],d6[0]}, [r0]!
-    vst4.32 {d0[1],d2[1],d4[1],d6[1]}, [r0]!
-    vst4.32 {d1[0],d3[0],d5[0],d7[0]}, [r0]!
-    vst4.32 {d1[1],d3[1],d5[1],d7[1]}, [r0]!
-SNIP_END(_N_ColorMatrix_store_f32_4)
-
-SNIP_START(_N_ColorMatrix_store_f32_3)
-    vst4.32 {d0[0],d2[0],d4[0],d6[0]}, [r0]!
-    vst4.32 {d0[1],d2[1],d4[1],d6[1]}, [r0]!
-    vst4.32 {d1[0],d3[0],d5[0],d7[0]}, [r0]!
-    vst4.32 {d1[1],d3[1],d5[1],d7[1]}, [r0]!
-SNIP_END(_N_ColorMatrix_store_f32_3)
-
-SNIP_START(_N_ColorMatrix_store_f32_2)
-    vst2.32 {d0[0],d2[0]}, [r0]!
-    vst2.32 {d0[1],d2[1]}, [r0]!
-    vst2.32 {d1[0],d3[0]}, [r0]!
-    vst2.32 {d1[1],d3[1]}, [r0]!
-SNIP_END(_N_ColorMatrix_store_f32_2)
-
-SNIP_START(_N_ColorMatrix_store_f32_1)
-    vst1.32 {q0}, [r0]!
-SNIP_END(_N_ColorMatrix_store_f32_1)
-
-
-SNIP_START(_N_ColorMatrix_unpack_u8_4)
-    vmovl.u8 q12, d0  /* R */
-    vmovl.u8 q13, d1  /* G */
-    vmovl.u8 q14, d2  /* B */
-    vmovl.u8 q15, d3  /* A */
-SNIP_END(_N_ColorMatrix_unpack_u8_4)
-
-SNIP_START(_N_ColorMatrix_unpack_u8_3)
-    vmovl.u8 q12, d0  /* R */
-    vmovl.u8 q13, d1  /* G */
-    vmovl.u8 q14, d2  /* B */
-    veor q15, q15
-SNIP_END(_N_ColorMatrix_unpack_u8_3)
-
-SNIP_START(_N_ColorMatrix_unpack_u8_2)
-    vmovl.u8 q12, d0  /* R */
-    vmovl.u8 q13, d1  /* G */
-    veor q14, q14
-    veor q15, q15
-SNIP_END(_N_ColorMatrix_unpack_u8_2)
-
-SNIP_START(_N_ColorMatrix_unpack_u8_1)
-    vmovl.u8 q12, d0  /* R */
-    veor q13, q13
-    veor q14, q14
-    veor q15, q15
-SNIP_END(_N_ColorMatrix_unpack_u8_1)
-
-SNIP_START(_N_ColorMatrix_pack_u8_4)
-    vqrshrn.s32 d24, q8, #8
-    vqrshrn.s32 d26, q9, #8
-    vqrshrn.s32 d28, q10, #8
-    vqrshrn.s32 d30, q11, #8
-    vqmovun.s16 d0, q12
-    vqmovun.s16 d1, q13
-    vqmovun.s16 d2, q14
-    vqmovun.s16 d3, q15
-SNIP_END(_N_ColorMatrix_pack_u8_4)
-
-SNIP_START(_N_ColorMatrix_pack_u8_3)
-    vqrshrn.s32 d24, q8, #8
-    vqrshrn.s32 d26, q9, #8
-    vqrshrn.s32 d28, q10, #8
-    vqmovun.s16 d0, q12
-    vqmovun.s16 d1, q13
-    vqmovun.s16 d2, q14
-SNIP_END(_N_ColorMatrix_pack_u8_3)
-
-SNIP_START(_N_ColorMatrix_pack_u8_2)
-    vqrshrn.s32 d24, q8, #8
-    vqrshrn.s32 d26, q9, #8
-    vqmovun.s16 d0, q12
-    vqmovun.s16 d1, q13
-SNIP_END(_N_ColorMatrix_pack_u8_2)
-
-SNIP_START(_N_ColorMatrix_pack_u8_1)
-    vqrshrn.s32 d24, q8, #8
-    vqmovun.s16 d0, q12
-SNIP_END(_N_ColorMatrix_pack_u8_1)
-
-SNIP_START(_N_ColorMatrix_dot)
-    vmov.u8 d1, d0
-    vmov.u8 d2, d0
-SNIP_END(_N_ColorMatrix_dot)
-

diff --git a/toolkit/Convolve3x3.cpp b/toolkit/Convolve3x3.cpp
deleted file mode 100644
index 51339a2..0000000
--- a/toolkit/Convolve3x3.cpp
+++ /dev/null

@@ -1,264 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-
-#include "RenderScriptToolkit.h"
-#include "TaskProcessor.h"
-#include "Utils.h"
-
-#define LOG_TAG "renderscript.toolkit.Convolve3x3"
-
-namespace android {
-namespace renderscript {
-
-extern "C" void rsdIntrinsicConvolve3x3_K(void* dst, const void* y0, const void* y1, const void* y2,
-                                          const int16_t* coef, uint32_t count);
-
-class Convolve3x3Task : public Task {
-    const void* mIn;
-    void* mOut;
-    // Even though we have exactly 9 coefficients, store them in an array of size 16 so that
-    // the SIMD instructions can load them in chunks multiple of 8.
-    float mFp[16];
-    int16_t mIp[16];
-
-    void kernelU4(uchar* out, uint32_t xstart, uint32_t xend, const uchar* py0, const uchar* py1,
-                  const uchar* py2);
-    void convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
-                    size_t startX, size_t startY, size_t endX, size_t endY);
-
-    // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
-    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
-                             size_t endY) override;
-
-   public:
-    Convolve3x3Task(const void* in, void* out, size_t vectorSize, size_t sizeX, size_t sizeY,
-                    const float* coefficients, const Restriction* restriction)
-        : Task{sizeX, sizeY, vectorSize, false, restriction}, mIn{in}, mOut{out} {
-        for (int ct = 0; ct < 9; ct++) {
-            mFp[ct] = coefficients[ct];
-            if (mFp[ct] >= 0) {
-                mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
-            } else {
-                mIp[ct] = (int16_t)(mFp[ct] * 256.f - 0.5f);
-            }
-        }
-    }
-};
-
-/**
- * Computes one convolution and stores the result in the output. This is used for uchar, uchar2,
- * uchar3, and uchar4 vectors.
- *
- * @tparam InputOutputType Type of the input and output arrays. A vector type, e.g. uchar4.
- * @tparam ComputationType Type we use for the intermediate computations.
- * @param x The index in the row of the value we'll convolve.
- * @param out The location in the output array where we store the value.
- * @param py0 The start of the top row.
- * @param py1 The start of the middle row.
- * @param py2 The start of the bottom row.
- * @param coeff Pointer to the float coefficients, in row major format.
- * @param sizeX The number of cells of one row.
- */
-template <typename InputOutputType, typename ComputationType>
-static void convolveOneU(uint32_t x, InputOutputType* out, const InputOutputType* py0,
-                         const InputOutputType* py1, const InputOutputType* py2, const float* coeff,
-                         int32_t sizeX) {
-    uint32_t x1 = std::max((int32_t)x - 1, 0);
-    uint32_t x2 = std::min((int32_t)x + 1, sizeX - 1);
-
-    ComputationType px = convert<ComputationType>(py0[x1]) * coeff[0] +
-                         convert<ComputationType>(py0[x]) * coeff[1] +
-                         convert<ComputationType>(py0[x2]) * coeff[2] +
-                         convert<ComputationType>(py1[x1]) * coeff[3] +
-                         convert<ComputationType>(py1[x]) * coeff[4] +
-                         convert<ComputationType>(py1[x2]) * coeff[5] +
-                         convert<ComputationType>(py2[x1]) * coeff[6] +
-                         convert<ComputationType>(py2[x]) * coeff[7] +
-                         convert<ComputationType>(py2[x2]) * coeff[8];
-
-    px = clamp(px + 0.5f, 0.f, 255.f);
-    *out = convert<InputOutputType>(px);
-}
-
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-/**
- * Computes one convolution and stores the result in the output. This is used for float, float2,
- * float3, and float4 vectors.
- *
- * @tparam InputOutputType Type of the input and output arrays. A vector type, e.g. float4.
- * @param x The index in the row of the value we'll convolve.
- * @param out The location in the output array where we store the value.
- * @param py0 The start of the top row.
- * @param py1 The start of the middle row.
- * @param py2 The start of the bottom row.
- * @param coeff Pointer to the float coefficients, in row major format.
- * @param sizeX The number of cells of one row.
- */
-template <typename InputOutputType>
-static void ConvolveOneF(uint32_t x, InputOutputType* out, const InputOutputType* py0,
-                         const InputOutputType* py1, const InputOutputType* py2, const float* coeff,
-                         int32_t sizeX) {
-    uint32_t x1 = std::max((int32_t)x - 1, 0);
-    uint32_t x2 = std::min((int32_t)x + 1, sizeX - 1);
-    *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
-           (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
-           (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
-}
-#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-
-/**
- * This function convolves one line.
- *
- * @param pout Where to place the next output.
- * @param xstart Index in the X direction of where to start.
- * @param xend End index
- * @param ppy0 Points to the start of the previous line.
- * @param ppy1 Points to the start of the current line.
- * @param ppy2 Points to the start of the next line.
- */
-void Convolve3x3Task::kernelU4(uchar* pout, uint32_t xstart, uint32_t xend, const uchar* ppy0,
-                               const uchar* ppy1, const uchar* ppy2) {
-    uchar4* out = (uchar4*)pout;
-    const uchar4* py0 = (const uchar4*)ppy0;
-    const uchar4* py1 = (const uchar4*)ppy1;
-    const uchar4* py2 = (const uchar4*)ppy2;
-
-    uint32_t x1 = xstart;
-    uint32_t x2 = xend;
-    if (x1 == 0) {
-        convolveOneU<uchar4, float4>(0, out, py0, py1, py2, mFp, mSizeX);
-        x1++;
-        out++;
-    }
-
-    if (x2 > x1) {
-#if defined(ARCH_ARM_USE_INTRINSICS) || defined(ARCH_X86_HAVE_SSSE3)
-        if (mUsesSimd) {
-            int32_t len = (x2 - x1 - 1) >> 1;
-            if (len > 0) {
-                rsdIntrinsicConvolve3x3_K(out, &py0[x1 - 1], &py1[x1 - 1], &py2[x1 - 1], mIp, len);
-                x1 += len << 1;
-                out += len << 1;
-            }
-        }
-#endif
-
-        while (x1 != x2) {
-            convolveOneU<uchar4, float4>(x1, out, py0, py1, py2, mFp, mSizeX);
-            out++;
-            x1++;
-        }
-    }
-}
-
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-template <typename T>
-void RsdCpuScriptIntrinsicConvolve3x3_kernelF(void* in, T* out, uint32_t xstart, uint32_t xend,
-                                              uint32_t currentY, size_t sizeX, size_t sizeY,
-                                              size_t vectorSize, float* fp) {
-    const uchar* pin = (const uchar*)in;
-    const size_t stride = sizeX * vectorSize * 4;  // float takes 4 bytes
-
-    uint32_t y1 = std::min((int32_t)currentY + 1, (int32_t)(sizeY - 1));
-    uint32_t y2 = std::max((int32_t)currentY - 1, 0);
-    const T* py0 = (const T*)(pin + stride * y2);
-    const T* py1 = (const T*)(pin + stride * currentY);
-    const T* py2 = (const T*)(pin + stride * y1);
-
-    for (uint32_t x = xstart; x < xend; x++, out++) {
-        ConvolveOneF<T>(x, out, py0, py1, py2, fp, sizeX);
-    }
-}
-#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-
-template <typename InputOutputType, typename ComputationType>
-static void convolveU(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
-                      size_t startX, size_t startY, size_t endX, size_t endY, float* fp) {
-    const size_t stride = vectorSize * sizeX;
-    for (size_t y = startY; y < endY; y++) {
-        uint32_t y1 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
-        uint32_t y2 = std::max((int32_t)y - 1, 0);
-
-        size_t offset = (y * sizeX + startX) * vectorSize;
-        InputOutputType* px = (InputOutputType*)(pout + offset);
-        InputOutputType* py0 = (InputOutputType*)(pin + stride * y2);
-        InputOutputType* py1 = (InputOutputType*)(pin + stride * y);
-        InputOutputType* py2 = (InputOutputType*)(pin + stride * y1);
-        for (uint32_t x = startX; x < endX; x++, px++) {
-            convolveOneU<InputOutputType, ComputationType>(x, px, py0, py1, py2, fp, sizeX);
-        }
-    }
-}
-
-void Convolve3x3Task::convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX,
-                                 size_t sizeY, size_t startX, size_t startY, size_t endX,
-                                 size_t endY) {
-    const size_t stride = paddedSize(vectorSize) * sizeX;
-    for (size_t y = startY; y < endY; y++) {
-        uint32_t y1 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
-        uint32_t y2 = std::max((int32_t)y - 1, 0);
-
-        size_t offset = (y * sizeX + startX) * paddedSize(vectorSize);
-        uchar* px = pout + offset;
-        const uchar* py0 = pin + stride * y2;
-        const uchar* py1 = pin + stride * y;
-        const uchar* py2 = pin + stride * y1;
-        kernelU4(px, startX, endX, py0, py1, py2);
-    }
-}
-
-void Convolve3x3Task::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
-                                  size_t endY) {
-    // ALOGI("Thread %d start tile from (%zd, %zd) to (%zd, %zd)", threadIndex, startX, startY,
-    // endX, endY);
-    switch (mVectorSize) {
-        case 1:
-            convolveU<uchar, float>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
-                                    startX, startY, endX, endY, mFp);
-            break;
-        case 2:
-            convolveU<uchar2, float2>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
-                                      startX, startY, endX, endY, mFp);
-            break;
-        case 3:
-        case 4:
-            convolveU4((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, startX, startY,
-                       endX, endY);
-            break;
-    }
-}
-
-void RenderScriptToolkit::convolve3x3(const void* in, void* out, size_t vectorSize, size_t sizeX,
-                                      size_t sizeY, const float* coefficients,
-                                      const Restriction* restriction) {
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
-    if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
-        return;
-    }
-    if (vectorSize < 1 || vectorSize > 4) {
-        ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
-        return;
-    }
-#endif
-
-    Convolve3x3Task task(in, out, vectorSize, sizeX, sizeY, coefficients, restriction);
-    processor->doTask(&task);
-}
-
-}  // namespace renderscript
-}  // namespace android

diff --git a/toolkit/Convolve5x5.cpp b/toolkit/Convolve5x5.cpp
deleted file mode 100644
index 1f3f75c..0000000
--- a/toolkit/Convolve5x5.cpp
+++ /dev/null

@@ -1,350 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-
-#include "RenderScriptToolkit.h"
-#include "TaskProcessor.h"
-#include "Utils.h"
-
-namespace android {
-namespace renderscript {
-
-#define LOG_TAG "renderscript.toolkit.Convolve5x5"
-
-extern "C" void rsdIntrinsicConvolve5x5_K(void* dst, const void* y0, const void* y1, const void* y2,
-                                          const void* y3, const void* y4, const int16_t* coef,
-                                          uint32_t count);
-
-class Convolve5x5Task : public Task {
-    const void* mIn;
-    void* mOut;
-    // Even though we have exactly 25 coefficients, store them in an array of size 28 so that
-    // the SIMD instructions can load them in three chunks of 8 and 1 of chunk of 4.
-    float mFp[28];
-    int16_t mIp[28];
-
-    void kernelU4(uchar* out, uint32_t xstart, uint32_t xend, const uchar* py0, const uchar* py1,
-                  const uchar* py2, const uchar* py3, const uchar* py4);
-    void convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
-                    size_t startX, size_t startY, size_t endX, size_t endY);
-
-    // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
-    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
-                             size_t endY) override;
-
-   public:
-    Convolve5x5Task(const void* in, void* out, size_t vectorSize, size_t sizeX, size_t sizeY,
-                    const float* coefficients, const Restriction* restriction)
-        : Task{sizeX, sizeY, vectorSize, false, restriction}, mIn{in}, mOut{out} {
-        for (int ct = 0; ct < 25; ct++) {
-            mFp[ct] = coefficients[ct];
-            if (mFp[ct] >= 0) {
-                mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
-            } else {
-                mIp[ct] = (int16_t)(mFp[ct] * 256.f - 0.5f);
-            }
-        }
-    }
-};
-
-template <typename InputOutputType, typename ComputationType>
-static void ConvolveOneU(uint32_t x, InputOutputType* out, const InputOutputType* py0,
-                         const InputOutputType* py1, const InputOutputType* py2,
-                         const InputOutputType* py3, const InputOutputType* py4, const float* coeff,
-                         int32_t width) {
-    uint32_t x0 = std::max((int32_t)x - 2, 0);
-    uint32_t x1 = std::max((int32_t)x - 1, 0);
-    uint32_t x2 = x;
-    uint32_t x3 = std::min((int32_t)x + 1, width - 1);
-    uint32_t x4 = std::min((int32_t)x + 2, width - 1);
-
-    ComputationType px = convert<ComputationType>(py0[x0]) * coeff[0] +
-                         convert<ComputationType>(py0[x1]) * coeff[1] +
-                         convert<ComputationType>(py0[x2]) * coeff[2] +
-                         convert<ComputationType>(py0[x3]) * coeff[3] +
-                         convert<ComputationType>(py0[x4]) * coeff[4] +
-
-                         convert<ComputationType>(py1[x0]) * coeff[5] +
-                         convert<ComputationType>(py1[x1]) * coeff[6] +
-                         convert<ComputationType>(py1[x2]) * coeff[7] +
-                         convert<ComputationType>(py1[x3]) * coeff[8] +
-                         convert<ComputationType>(py1[x4]) * coeff[9] +
-
-                         convert<ComputationType>(py2[x0]) * coeff[10] +
-                         convert<ComputationType>(py2[x1]) * coeff[11] +
-                         convert<ComputationType>(py2[x2]) * coeff[12] +
-                         convert<ComputationType>(py2[x3]) * coeff[13] +
-                         convert<ComputationType>(py2[x4]) * coeff[14] +
-
-                         convert<ComputationType>(py3[x0]) * coeff[15] +
-                         convert<ComputationType>(py3[x1]) * coeff[16] +
-                         convert<ComputationType>(py3[x2]) * coeff[17] +
-                         convert<ComputationType>(py3[x3]) * coeff[18] +
-                         convert<ComputationType>(py3[x4]) * coeff[19] +
-
-                         convert<ComputationType>(py4[x0]) * coeff[20] +
-                         convert<ComputationType>(py4[x1]) * coeff[21] +
-                         convert<ComputationType>(py4[x2]) * coeff[22] +
-                         convert<ComputationType>(py4[x3]) * coeff[23] +
-                         convert<ComputationType>(py4[x4]) * coeff[24];
-    px = clamp(px + 0.5f, 0.f, 255.f);
-    *out = convert<InputOutputType>(px);
-}
-
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-template <typename InputOutputType>
-static void ConvolveOneF(uint32_t x, InputOutputType* out, const InputOutputType* py0,
-                         const InputOutputType* py1, const InputOutputType* py2,
-                         const InputOutputType* py3, const InputOutputType* py4, const float* coeff,
-                         int32_t width) {
-    uint32_t x0 = std::max((int32_t)x - 2, 0);
-    uint32_t x1 = std::max((int32_t)x - 1, 0);
-    uint32_t x2 = x;
-    uint32_t x3 = std::min((int32_t)x + 1, width - 1);
-    uint32_t x4 = std::min((int32_t)x + 2, width - 1);
-
-    InputOutputType px = py0[x0] * coeff[0] + py0[x1] * coeff[1] + py0[x2] * coeff[2] +
-                         py0[x3] * coeff[3] + py0[x4] * coeff[4] +
-
-                         py1[x0] * coeff[5] + py1[x1] * coeff[6] + py1[x2] * coeff[7] +
-                         py1[x3] * coeff[8] + py1[x4] * coeff[9] +
-
-                         py2[x0] * coeff[10] + py2[x1] * coeff[11] + py2[x2] * coeff[12] +
-                         py2[x3] * coeff[13] + py2[x4] * coeff[14] +
-
-                         py3[x0] * coeff[15] + py3[x1] * coeff[16] + py3[x2] * coeff[17] +
-                         py3[x3] * coeff[18] + py3[x4] * coeff[19] +
-
-                         py4[x0] * coeff[20] + py4[x1] * coeff[21] + py4[x2] * coeff[22] +
-                         py4[x3] * coeff[23] + py4[x4] * coeff[24];
-    *out = px;
-}
-#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-
-/**
- * This function convolves one line.
- *
- * @param pout Where to place the next output.
- * @param xstart Index in the X direction of where to start.
- * @param xend End index
- * @param ppy0 Points to the start of the line two above.
- * @param ppy1 Points to the start of the line one above.
- * @param ppy2 Points to the start of the current line.
- * @param ppy3 Points to the start of the line one below.
- * @param ppy4 Points to the start of the line two below.
- */
-void Convolve5x5Task::kernelU4(uchar* pout, uint32_t x1, uint32_t x2, const uchar* ppy0,
-                               const uchar* ppy1, const uchar* ppy2, const uchar* ppy3,
-                               const uchar* ppy4) {
-    uchar4* out = (uchar4*)pout;
-    const uchar4* py0 = (const uchar4*)ppy0;
-    const uchar4* py1 = (const uchar4*)ppy1;
-    const uchar4* py2 = (const uchar4*)ppy2;
-    const uchar4* py3 = (const uchar4*)ppy3;
-    const uchar4* py4 = (const uchar4*)ppy4;
-
-    while ((x1 < x2) && (x1 < 2)) {
-        ConvolveOneU<uchar4, float4>(x1, out, py0, py1, py2, py3, py4, mFp, mSizeX);
-        out++;
-        x1++;
-    }
-#if defined(ARCH_X86_HAVE_SSSE3)
-    // for x86 SIMD, require minimum of 7 elements (4 for SIMD,
-    // 3 for end boundary where x may hit the end boundary)
-    if (mUsesSimd && ((x1 + 6) < x2)) {
-        // subtract 3 for end boundary
-        uint32_t len = (x2 - x1 - 3) >> 2;
-        rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2,
-                                  py4 + x1 - 2, mIp, len);
-        out += len << 2;
-        x1 += len << 2;
-    }
-#endif
-
-#if defined(ARCH_ARM_USE_INTRINSICS)
-    if (mUsesSimd && ((x1 + 3) < x2)) {
-        uint32_t len = (x2 - x1 - 3) >> 1;
-        rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2,
-                                  py4 + x1 - 2, mIp, len);
-        out += len << 1;
-        x1 += len << 1;
-    }
-#endif
-
-    while (x1 < x2) {
-        ConvolveOneU<uchar4, float4>(x1, out, py0, py1, py2, py3, py4, mFp, mSizeX);
-        out++;
-        x1++;
-    }
-}
-
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-// This will need more cleanup before it can be used.
-void Convolve5x5Task::kernelF4(const ConvolveInfo* info, float4* out,
-                               uint32_t xstart, uint32_t xend, uint32_t currentY) {
-    const uchar* pin = (const uchar*)info->in;
-    const size_t stride = info->stride;
-
-    uint32_t y0 = std::max((int32_t)currentY - 2, 0);
-    uint32_t y1 = std::max((int32_t)currentY - 1, 0);
-    uint32_t y2 = currentY;
-    uint32_t y3 = std::min((int32_t)currentY + 1, sizeY);
-    uint32_t y4 = std::min((int32_t)currentY + 2, sizeY);
-
-    const float4* py0 = (const float4*)(pin + stride * y0);
-    const float4* py1 = (const float4*)(pin + stride * y1);
-    const float4* py2 = (const float4*)(pin + stride * y2);
-    const float4* py3 = (const float4*)(pin + stride * y3);
-    const float4* py4 = (const float4*)(pin + stride * y4);
-
-    for (uint32_t x = xstart; x < xend; x++, out++) {
-        ConvolveOneF<float4>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
-    }
-}
-
-void RsdCpuScriptIntrinsicConvolve5x5_kernelF2(const ConvolveInfo* info, float2* out,
-                                               uint32_t xstart, uint32_t xend, uint32_t currentY) {
-    const uchar* pin = (const uchar*)info->in;
-    const size_t stride = info->stride;
-
-    uint32_t y0 = std::max((int32_t)currentY - 2, 0);
-    uint32_t y1 = std::max((int32_t)currentY - 1, 0);
-    uint32_t y2 = currentY;
-    uint32_t y3 = std::min((int32_t)currentY + 1, sizeY);
-    uint32_t y4 = std::min((int32_t)currentY + 2, sizeY);
-
-    const float2* py0 = (const float2*)(pin + stride * y0);
-    const float2* py1 = (const float2*)(pin + stride * y1);
-    const float2* py2 = (const float2*)(pin + stride * y2);
-    const float2* py3 = (const float2*)(pin + stride * y3);
-    const float2* py4 = (const float2*)(pin + stride * y4);
-
-    for (uint32_t x = xstart; x < xend; x++, out++) {
-        ConvolveOneF<float2>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
-    }
-}
-
-void RsdCpuScriptIntrinsicConvolve5x5_kernelF1(const ConvolveInfo* info, float* out,
-                                               uint32_t xstart, uint32_t xend, uint32_t currentY) {
-    const uchar* pin = (const uchar*)info->in;
-    const size_t stride = info->stride;
-
-    uint32_t y0 = std::max((int32_t)currentY - 2, 0);
-    uint32_t y1 = std::max((int32_t)currentY - 1, 0);
-    uint32_t y2 = currentY;
-    uint32_t y3 = std::min((int32_t)currentY + 1, sizeY);
-    uint32_t y4 = std::min((int32_t)currentY + 2, sizeY);
-
-    const float* py0 = (const float*)(pin + stride * y0);
-    const float* py1 = (const float*)(pin + stride * y1);
-    const float* py2 = (const float*)(pin + stride * y2);
-    const float* py3 = (const float*)(pin + stride * y3);
-    const float* py4 = (const float*)(pin + stride * y4);
-
-    for (uint32_t x = xstart; x < xend; x++, out++) {
-        ConvolveOneF<float>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
-    }
-}
-#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-
-template <typename InputOutputType, typename ComputationType>
-static void convolveU(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
-                      size_t startX, size_t startY, size_t endX, size_t endY, float* mFp) {
-    const size_t stride = vectorSize * sizeX;
-    for (size_t y = startY; y < endY; y++) {
-        uint32_t y0 = std::max((int32_t)y - 2, 0);
-        uint32_t y1 = std::max((int32_t)y - 1, 0);
-        uint32_t y2 = y;
-        uint32_t y3 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
-        uint32_t y4 = std::min((int32_t)y + 2, (int32_t)(sizeY - 1));
-
-        size_t offset = (y * sizeX + startX) * vectorSize;
-        InputOutputType* px = (InputOutputType*)(pout + offset);
-        InputOutputType* py0 = (InputOutputType*)(pin + stride * y0);
-        InputOutputType* py1 = (InputOutputType*)(pin + stride * y1);
-        InputOutputType* py2 = (InputOutputType*)(pin + stride * y2);
-        InputOutputType* py3 = (InputOutputType*)(pin + stride * y3);
-        InputOutputType* py4 = (InputOutputType*)(pin + stride * y4);
-        for (uint32_t x = startX; x < endX; x++, px++) {
-            ConvolveOneU<InputOutputType, ComputationType>(x, px, py0, py1, py2, py3, py4, mFp,
-                                                           sizeX);
-        }
-    }
-}
-
-void Convolve5x5Task::convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX,
-                                 size_t sizeY, size_t startX, size_t startY, size_t endX,
-                                 size_t endY) {
-    const size_t stride = paddedSize(vectorSize) * sizeX;
-    for (size_t y = startY; y < endY; y++) {
-        uint32_t y0 = std::max((int32_t)y - 2, 0);
-        uint32_t y1 = std::max((int32_t)y - 1, 0);
-        uint32_t y2 = y;
-        uint32_t y3 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
-        uint32_t y4 = std::min((int32_t)y + 2, (int32_t)(sizeY - 1));
-
-        size_t offset = (y * sizeX + startX) * paddedSize(vectorSize);
-        uchar* px = pout + offset;
-        const uchar* py0 = pin + stride * y0;
-        const uchar* py1 = pin + stride * y1;
-        const uchar* py2 = pin + stride * y2;
-        const uchar* py3 = pin + stride * y3;
-        const uchar* py4 = pin + stride * y4;
-        kernelU4(px, startX, endX, py0, py1, py2, py3, py4);
-    }
-}
-
-void Convolve5x5Task::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
-                                  size_t endY) {
-    // ALOGI("Thread %d start tile from (%zd, %zd) to (%zd, %zd)", threadIndex, startX, startY,
-    // endX, endY);
-    switch (mVectorSize) {
-        case 1:
-            convolveU<uchar, float>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
-                                    startX, startY, endX, endY, mFp);
-            break;
-        case 2:
-            convolveU<uchar2, float2>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
-                                      startX, startY, endX, endY, mFp);
-            break;
-        case 3:
-        case 4:
-            convolveU4((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, startX, startY,
-                       endX, endY);
-            break;
-    }
-}
-
-void RenderScriptToolkit::convolve5x5(const void* in, void* out, size_t vectorSize, size_t sizeX,
-                                      size_t sizeY, const float* coefficients,
-                                      const Restriction* restriction) {
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
-    if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
-        return;
-    }
-    if (vectorSize < 1 || vectorSize > 4) {
-        ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
-        return;
-    }
-#endif
-
-    Convolve5x5Task task(in, out, vectorSize, sizeX, sizeY, coefficients, restriction);
-    processor->doTask(&task);
-}
-
-}  // namespace renderscript
-}  // namespace android

diff --git a/toolkit/Convolve_advsimd.S b/toolkit/Convolve_advsimd.S
deleted file mode 100644
index 0daa0c5..0000000
--- a/toolkit/Convolve_advsimd.S
+++ /dev/null

@@ -1,265 +0,0 @@
-/*
- * Copyright (C) 2012,2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-        x0 = dst
-        x1 = y0 base pointer
-        x2 = y1 base pointer
-        x3 = y2 base pointer
-        x4 = coeffs
-        x5 = length / 2
-*/
-
-#define ENTRY(f) .text; .align 2; .globl f; .type f,#function; f:
-#define END(f) .size f, .-f;
-
-ENTRY(rsdIntrinsicConvolve3x3_K)
-        sub             x6, sp, #64                     // x6 -> 64-byte save area just below the incoming sp
-        sub             sp, sp, #64
-        st1             {v8.1d-v11.1d}, [x6], #32       // spill the low 64 bits of v8-v11
-        st1             {v12.1d-v15.1d}, [x6]           // and of v12-v15; all reloaded before ret
-
-        /* Load the coefficients in the v0, v1 registers.
-           Sixteen 16-bit values are read from x4; the multiplies below only
-           use v0.h[0]-v0.h[7] and v1.h[0], i.e. the first nine. */
-        ld1     {v0.8h, v1.8h}, [x4]
-
-        /* Load the frequently used immediate in a register.
-           x4 can be reused because the coefficients are already loaded. */
-        mov x4, #8
-
-1:
-        /* Load and post-increase the address by x4=#8.
-           Each load reads 16 bytes while the pointer advances by only 8, so
-           consecutive iterations re-read the upper 8 bytes. */
-        ld1     {v13.16b}, [x1], x4
-        ld1     {v14.16b}, [x2], x4
-        ld1     {v15.16b}, [x3], x4
-
-        /* Signal memory for data that will be used in the loop after the next */
-//        prfm        PLDL1KEEP,[x1, x4] // TODO: test this
-//        prfm        PLDL1KEEP,[x2, x4] // TODO: test this
-//        prfm        PLDL1KEEP,[x3, x4] // TODO: test this
-
-        uxtl      v2.8h, v13.8b         // zero-extend the 8-bit samples to 16 bits
-        uxtl2     v3.8h, v13.16b
-        uxtl      v4.8h, v14.8b
-        uxtl2     v5.8h, v14.16b
-        uxtl      v6.8h, v15.8b
-        uxtl2     v7.8h, v15.16b
-
-/*
-        The two pixel source array is
-        v2,  v2hi,  v3lo,  v3hi
-        v4,  v4hi,  v5lo, v5hi
-        v6, v6hi, v7lo, v7hi
-
-        v8 accumulates output lanes 0-3 and v9 output lanes 4-7. Each
-        coefficient multiplies a 4-lane window that slides by four lanes per
-        tap (presumably one 4-channel pixel per window - confirm with caller).
-*/
-
-        smull     v8.4s, v2.4h, v0.h[0]
-        smull2    v9.4s, v2.8h, v0.h[0]
-        smlal2    v8.4s, v2.8h, v0.h[1]
-        smlal     v9.4s, v3.4h, v0.h[1]
-        smlal     v8.4s, v3.4h, v0.h[2]
-        smlal2    v9.4s, v3.8h, v0.h[2]
-        smlal     v8.4s, v4.4h, v0.h[3]
-        smlal2    v9.4s, v4.8h, v0.h[3]
-        smlal2    v8.4s, v4.8h, v0.h[4]
-        smlal     v9.4s, v5.4h, v0.h[4]
-        smlal     v8.4s, v5.4h, v0.h[5]
-        smlal2    v9.4s, v5.8h, v0.h[5]
-        smlal     v8.4s, v6.4h, v0.h[6]
-        smlal2    v9.4s, v6.8h, v0.h[6]
-        smlal2    v8.4s, v6.8h, v0.h[7]
-        smlal     v9.4s, v7.4h, v0.h[7]
-        smlal     v8.4s, v7.4h, v1.h[0]
-        smlal2    v9.4s, v7.8h, v1.h[0]
-
-        shrn      v8.4h, v8.4s, #8      // truncating >> 8 while narrowing 32 -> 16 bits
-        shrn2     v8.8h, v9.4s, #8
-
-        sqxtun      v8.8b, v8.8h        // saturate signed 16-bit to unsigned 8-bit
-        st1         {v8.8b}, [x0], #8
-
-        /* Are we done yet?
-           x5 counts 8-byte output groups and is tested after the decrement.
-           NOTE(review): x5 == 0 on entry would wrap around instead of exiting;
-           presumably callers never pass 0 - confirm. */
-        subs x5, x5, #1
-        bne 1b
-
-        /* We're done, bye!
-           Reload v8-v15 and release the 64-byte save area. */
-        ld1             {v8.1d-v11.1d}, [sp], #32
-        ld1             {v12.1d-v15.1d}, [sp], #32
-        ret
-END(rsdIntrinsicConvolve3x3_K)
-
-
-/* Convolve 5x5 */
-
-/*
-        x0 = dst
-        x1 = y0 base pointer
-        x2 = y1 base pointer
-        x3 = y2 base pointer
-        x4 = y3 base pointer
-        x5 = y4 base pointer
-        x6 = coeffs
-        x7 = loop count (each iteration stores 8 output bytes)
-*/
-ENTRY(rsdIntrinsicConvolve5x5_K)
-        sub         x8, sp, #64                         // x8 -> 64-byte save area just below the incoming sp
-        sub         sp, sp, #64
-        st1         {v8.1d-v11.1d}, [x8], #32           // spill the low 64 bits of v8-v11
-        st1         {v12.1d-v15.1d}, [x8]               // and of v12-v15; all reloaded before ret
-
-        /* Create the coefficients vector.
-           24 + 4 = 28 16-bit values are read from x6; the multiplies below use
-           v0.h[0] through v3.h[0], i.e. the first 25. */
-        ld1         {v0.8h-v2.8h}, [x6], #48
-        ld1         {v3.4h}, [x6], #8
-
-        movi      v15.4s, #0x7f         // bias added to every 32-bit sum before the narrowing shift
-
-        /* Load the frequently used immediate in a register.
-           x6 can be reused because the coefficients are already loaded. */
-        mov     x6, #8
-
-1:
-        /* Load the y base pointers in Qregs and post-increase the address by x6=#8.
-           Each load reads 24 bytes while the pointer advances by only 8. */
-        ld1     {v9.8b-v11.8b}, [x1], x6      //  y0 ( y - 2 )
-        ld1     {v12.8b-v14.8b}, [x2], x6      //  y1 ( y - 1 )
-
-        /* Signal memory for data that will be used in the loop after the next */
-//        prfm        PLDL1KEEP,[x1, x6] // TODO: test this
-//        prfm        PLDL1KEEP,[x2, x6] // TODO: test this
-
-        /* Promoting the 8bit channels to 16bit */
-        uxtl      v9.8h,  v9.8b
-        uxtl      v10.8h, v10.8b
-        uxtl      v11.8h, v11.8b
-        uxtl      v12.8h, v12.8b
-        uxtl      v13.8h, v13.8b
-        uxtl      v14.8h, v14.8b
-
-/*
-        v9,  v9hi,  v10lo, v10hi, v11lo, v11hi,
-        v12,  v12hi
-
-        v4 accumulates output lanes 0-3 and v5 output lanes 4-7. Each row
-        contributes five taps, each tap a 4-lane window sliding by four lanes.
-*/
-        smull     v4.4s, v9.4h, v0.h[0]
-        smull2    v5.4s, v9.8h, v0.h[0]
-        smlal2    v4.4s, v9.8h, v0.h[1]
-        smlal     v5.4s, v10.4h, v0.h[1]
-        smlal     v4.4s, v10.4h, v0.h[2]
-        smlal2    v5.4s, v10.8h, v0.h[2]
-        smlal2    v4.4s, v10.8h, v0.h[3]
-        smlal     v5.4s, v11.4h, v0.h[3]
-        smlal     v4.4s, v11.4h, v0.h[4]
-        smlal2    v5.4s, v11.8h, v0.h[4]
-
-        smlal     v4.4s, v12.4h, v0.h[5]
-        smlal2    v5.4s, v12.8h, v0.h[5]
-        smlal2    v4.4s, v12.8h, v0.h[6]
-        smlal     v5.4s, v13.4h, v0.h[6]
-        smlal     v4.4s, v13.4h, v0.h[7]
-        smlal2    v5.4s, v13.8h, v0.h[7]
-        smlal2    v4.4s, v13.8h, v1.h[0]
-        smlal     v5.4s, v14.4h, v1.h[0]
-        smlal     v4.4s, v14.4h, v1.h[1]
-        smlal2    v5.4s, v14.8h, v1.h[1]
-
-        /* Next 2 rows */
-        /* Load the y base pointers in Qregs and post-increase the address by x6=#8 */
-        ld1     {v9.8b-v11.8b}, [x3], x6      //  y2 ( y )
-        ld1     {v12.8b-v14.8b}, [x4], x6      //  y3 ( y + 1 )
-
-        /* Signal memory for data that will be used in the loop after the next */
-//        prfm        PLDL1KEEP,[x3, x6] // TODO: test this
-//        prfm        PLDL1KEEP,[x4, x6] // TODO: test this
-
-        /* Promoting the 8bit channels to 16bit */
-        uxtl      v9.8h,  v9.8b
-        uxtl      v10.8h, v10.8b
-        uxtl      v11.8h, v11.8b
-        uxtl      v12.8h, v12.8b
-        uxtl      v13.8h, v13.8b
-        uxtl      v14.8h, v14.8b
-
-/*
-        v9,  v9hi,  v10lo, v10hi, v11lo, v11hi,
-        v12,  v12hi
-*/
-        smlal     v4.4s, v9.4h, v1.h[2]
-        smlal2    v5.4s, v9.8h, v1.h[2]
-        smlal2    v4.4s, v9.8h, v1.h[3]
-        smlal     v5.4s, v10.4h, v1.h[3]
-        smlal     v4.4s, v10.4h, v1.h[4]
-        smlal2    v5.4s, v10.8h, v1.h[4]
-        smlal2    v4.4s, v10.8h, v1.h[5]
-        smlal     v5.4s, v11.4h, v1.h[5]
-        smlal     v4.4s, v11.4h, v1.h[6]
-        smlal2    v5.4s, v11.8h, v1.h[6]
-
-        smlal     v4.4s, v12.4h, v1.h[7]
-        smlal2    v5.4s, v12.8h, v1.h[7]
-        smlal2    v4.4s, v12.8h, v2.h[0]
-        smlal     v5.4s, v13.4h, v2.h[0]
-        smlal     v4.4s, v13.4h, v2.h[1]
-        smlal2    v5.4s, v13.8h, v2.h[1]
-        smlal2    v4.4s, v13.8h, v2.h[2]
-        smlal     v5.4s, v14.4h, v2.h[2]
-        smlal     v4.4s, v14.4h, v2.h[3]
-        smlal2    v5.4s, v14.8h, v2.h[3]
-
-        /* Last row */
-        /* Load the y base pointers in Qregs and post-increase the address by x6=#8 */
-        ld1     {v9.8b- v11.8b}, [x5], x6      //  y4 ( y + 2 )
-
-        /* Signal memory for data that will be used in the loop after the next */
-//        prfm        PLDL1KEEP,[x5, x6] // TODO: test this
-
-        /* Promoting the 8bit channels to 16bit */
-        uxtl      v9.8h,  v9.8b
-        uxtl      v10.8h, v10.8b
-        uxtl      v11.8h, v11.8b
-
-/*
-        v9,  v9hi,  v10lo, v10hi, v11lo, v11hi,
-        v12,  v12hi
-
-        Only v9-v11 hold the last row; v12-v14 are not reloaded here and are
-        not used by the multiplies below.
-*/
-
-        smlal     v4.4s, v9.4h, v2.h[4]
-        smlal2    v5.4s, v9.8h, v2.h[4]
-        smlal2    v4.4s, v9.8h, v2.h[5]
-        smlal     v5.4s, v10.4h, v2.h[5]
-        smlal     v4.4s, v10.4h, v2.h[6]
-        smlal2    v5.4s, v10.8h, v2.h[6]
-        smlal2    v4.4s, v10.8h, v2.h[7]
-        smlal     v5.4s, v11.4h, v2.h[7]
-        smlal     v4.4s, v11.4h, v3.h[0]
-        smlal2    v5.4s, v11.8h, v3.h[0]
-
-        add      v4.4s, v4.4s, v15.4s   // add the 0x7f bias set up before the loop
-        add      v5.4s, v5.4s, v15.4s
-
-/*      Narrow it to a d-reg 32 -> 16 bit.
-        rshrn is the rounding form of the shift, applied on top of the bias above. */
-        rshrn      v4.4h, v4.4s, #8
-        rshrn2     v4.8h, v5.4s, #8
-
-
-/*      Pack 16 -> 8 bit, saturate, put two pixels into D reg */
-        sqxtun      v4.8b, v4.8h
-
-        st1     {v4.8b}, [x0], #8        // return the output and increase the address of x0
-
-        /* Are we done?
-           x7 counts 8-byte output groups and is tested after the decrement.
-           NOTE(review): x7 == 0 on entry would wrap around instead of exiting;
-           presumably callers never pass 0 - confirm. */
-        subs x7, x7, #1
-        bne 1b
-
-        /* Yup, bye.
-           Reload v8-v15 and release the 64-byte save area. */
-        ld1         {v8.1d-v11.1d}, [sp], #32
-        ld1         {v12.1d-v15.1d}, [sp], #32
-        ret
-
-END(rsdIntrinsicConvolve5x5_K)

diff --git a/toolkit/Convolve_neon.S b/toolkit/Convolve_neon.S
deleted file mode 100644
index ee10884..0000000
--- a/toolkit/Convolve_neon.S
+++ /dev/null

@@ -1,287 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-        r0 = dst
-        r1 = y0 base pointer
-        r2 = y1 base pointer
-        r3 = y2 base pointer
-        [sp]     = coeffs      (first stack argument on entry)
-        [sp, #4] = length / 2  (second stack argument on entry)
-*/
-
-#define ENTRY(f) .text; .align 0; .globl f; .type f,#function; f: .fnstart
-#define END(f) .fnend; .size f, .-f;
-
-ENTRY(rsdIntrinsicConvolve3x3_K)
-        push            {r4-r8, r10, r11, lr}           /* 8 registers = 32 bytes */
-        vpush           {q4-q7}                         /* 4 q registers = 64 bytes */
-
-        /* Get the coeffs pointer from the stack and load the
-           coefficients in the q0, q1 NEON registers.
-           The offset #32+64 skips the 32 + 64 bytes pushed above, so it
-           addresses the first stack argument. The multiplies below use the
-           first nine 16-bit values (d0[0] through d2[0]). */
-        ldr r4, [sp, #32+64]
-        vld1.16 {q0, q1}, [r4]
-
-        /* Get count from the stack (second stack argument; r4 is reused) */
-        ldr r4, [sp, #36+64]
-
-        /* Load the frequently used immediate in a register */
-        mov r5, #8
-
-1:
-        /* Load and post-increase the address by r5=#8.
-           Each load reads 16 bytes while the pointer advances by only 8. */
-        vld1.8 {q13}, [r1], r5
-        vld1.8 {q14}, [r2], r5
-        vld1.8 {q15}, [r3], r5
-
-        /* Signal memory for data that will be used in the loop after the next */
-        pld         [r1, r5]
-        pld         [r2, r5]
-        pld         [r3, r5]
-
-        vmovl.u8 q2, d26                /* zero-extend the 8-bit samples to 16 bits */
-        vmovl.u8 q3, d27
-        vmovl.u8 q4, d28
-        vmovl.u8 q5, d29
-        vmovl.u8 q6, d30
-        vmovl.u8 q7, d31
-
-/*
-        The two pixel source array is
-        d4,  d5,  d6,  d7
-        d8,  d9,  d10, d11
-        d12, d13, d14, d15
-
-        q8 accumulates output lanes 0-3 (windows starting at d4/d8/d12) and
-        q9 output lanes 4-7 (windows starting at d5/d9/d13).
-*/
-
-        vmull.s16 q8, d4, d0[0]
-        vmlal.s16 q8, d5, d0[1]
-        vmlal.s16 q8, d6, d0[2]
-        vmlal.s16 q8, d8, d0[3]
-        vmlal.s16 q8, d9, d1[0]
-        vmlal.s16 q8, d10, d1[1]
-        vmlal.s16 q8, d12, d1[2]
-        vmlal.s16 q8, d13, d1[3]
-        vmlal.s16 q8, d14, d2[0]
-
-        vmull.s16 q9, d5, d0[0]
-        vmlal.s16 q9, d6, d0[1]
-        vmlal.s16 q9, d7, d0[2]
-        vmlal.s16 q9, d9, d0[3]
-        vmlal.s16 q9, d10, d1[0]
-        vmlal.s16 q9, d11, d1[1]
-        vmlal.s16 q9, d13, d1[2]
-        vmlal.s16 q9, d14, d1[3]
-        vmlal.s16 q9, d15, d2[0]
-
-        vshrn.i32 d16, q8, #8           /* truncating >> 8 while narrowing 32 -> 16 bits */
-        vshrn.i32 d17, q9, #8
-
-        vqmovun.s16 d16, q8             /* saturate signed 16-bit to unsigned 8-bit */
-        vst1.8 d16, [r0]!
-
-        /* Are we done yet?
-           r4 counts 8-byte output groups and is tested after the decrement.
-           NOTE(review): a count of 0 on entry would wrap around instead of
-           exiting; presumably callers never pass 0 - confirm. */
-        subs r4, r4, #1
-        bne 1b
-
-        /* We're done, bye! */
-        vpop            {q4-q7}
-        pop             {r4-r8, r10, r11, lr}
-        bx              lr
-END(rsdIntrinsicConvolve3x3_K)
-
-
-/* Convolve 5x5 */
-
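-/*
-        On entry:
-        r0 = dst
-        r1 = y0 base pointer
-        r2 = y1 base pointer
-        r3 = y2 base pointer
-        [sp]      = y3 base pointer  (loaded into r4)
-        [sp, #4]  = y4 base pointer  (loaded into r5)
-        [sp, #8]  = coeffs           (loaded into r6, which is later reused for the count)
-        [sp, #12] = loop count       (loaded into r6; each iteration stores 8 output bytes)
-        r7 is used internally for the constant pointer increment (#8)
-*/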
-ENTRY(rsdIntrinsicConvolve5x5_K)
-        push        {r4-r7, lr}                 /* 5 registers = 20 bytes */
-        vpush       {q4-q7}                     /* 4 q registers = 64 bytes */
-
-        /* load y3 in r4 (offset #20 + 64 skips the bytes pushed above) */
-        ldr     r4, [sp, #20 + 64]
-
-        /* load y4 in r5 */
-        ldr     r5, [sp, #24 + 64]
-
-        /* Load the coefficients pointer */
-        ldr     r6, [sp, #28 + 64]
-
-        /* Create the coefficients vector.
-           16 + 12 = 28 16-bit values are read; the multiplies below use
-           d0[0] through d6[0], i.e. the first 25. */
-        vld1.16     {d0, d1, d2, d3}, [r6]!
-        vld1.16     {d4, d5, d6}, [r6]
-
-        vmov.u32  q15, #0x7f            /* bias added to every 32-bit sum before the narrowing shift */
-
-        /* load the count (r6 is reused; the coefficients are already loaded) */
-        ldr     r6, [sp, #32 + 64]
-
-        /* Load the frequently used immediate in a register */
-        mov     r7, #8
-
-1:
-        /* Load the y base pointers in Qregs and post-increase the address by r7=#8.
-           Each load reads 24 bytes while the pointer advances by only 8.
-           Note: every trailing label in this loop says "y0"; the rows are
-           y0 (r1), y1 (r2), y2 (r3), y3 (r4) and y4 (r5) in that order. */
-        vld1.8  {d24, d25, d26}, [r1], r7      @  y0 ( y - 2 )
-        vld1.8  {d27, d28, d29}, [r2], r7      @  y0 ( y - 1 )
-
-        /* Signal memory for data that will be used in the loop after the next */
-        pld         [r1, r7]
-        pld         [r2, r7]
-
-        /* Promoting the 8bit channels to 16bit.
-           The order matters: q12-q14 overlap d24-d29, and each source d
-           register is consumed before the widening that overwrites it. */
-        vmovl.u8 q9,  d24
-        vmovl.u8 q10, d25
-        vmovl.u8 q11, d26
-        vmovl.u8 q12, d27
-        vmovl.u8 q13, d28
-        vmovl.u8 q14, d29
-
-/*
-        d18,  d19,  d20, d21, d22, d23,
-        d24,  d25
-
-        After widening, the first row of the pair is in d18-d23 and the
-        second in d24-d29. q4 accumulates output lanes 0-3 and q5 output
-        lanes 4-7.
-*/
-        vmull.s16 q4, d18, d0[0]
-        vmlal.s16 q4, d19, d0[1]
-        vmlal.s16 q4, d20, d0[2]
-        vmlal.s16 q4, d21, d0[3]
-        vmlal.s16 q4, d22, d1[0]
-
-        vmlal.s16 q4, d24, d1[1]
-        vmlal.s16 q4, d25, d1[2]
-        vmlal.s16 q4, d26, d1[3]
-        vmlal.s16 q4, d27, d2[0]
-        vmlal.s16 q4, d28, d2[1]
-
-        vmull.s16 q5, d19, d0[0]
-        vmlal.s16 q5, d20, d0[1]
-        vmlal.s16 q5, d21, d0[2]
-        vmlal.s16 q5, d22, d0[3]
-        vmlal.s16 q5, d23, d1[0]
-
-        vmlal.s16 q5, d25, d1[1]
-        vmlal.s16 q5, d26, d1[2]
-        vmlal.s16 q5, d27, d1[3]
-        vmlal.s16 q5, d28, d2[0]
-        vmlal.s16 q5, d29, d2[1]
-
-
-        /* Next 2 rows */
-        /* Load the y base pointers in Qregs and post-increase the address by r7=#8 */
-        vld1.8  {d24, d25, d26}, [r3], r7      @  y0 ( y )
-        vld1.8  {d27, d28, d29}, [r4], r7      @  y0 ( y + 1 )
-
-        /* Signal memory for data that will be used in the loop after the next */
-        pld         [r3, r7]
-        pld         [r4, r7]
-
-        /* Promoting the 8bit channels to 16bit */
-        vmovl.u8 q9,  d24
-        vmovl.u8 q10, d25
-        vmovl.u8 q11, d26
-        vmovl.u8 q12, d27
-        vmovl.u8 q13, d28
-        vmovl.u8 q14, d29
-
-/*
-        d18,  d19,  d20, d21, d22, d23,
-        d24,  d25
-*/
-        vmlal.s16 q4, d18, d2[2]
-        vmlal.s16 q4, d19, d2[3]
-        vmlal.s16 q4, d20, d3[0]
-        vmlal.s16 q4, d21, d3[1]
-        vmlal.s16 q4, d22, d3[2]
-
-        vmlal.s16 q4, d24, d3[3]
-        vmlal.s16 q4, d25, d4[0]
-        vmlal.s16 q4, d26, d4[1]
-        vmlal.s16 q4, d27, d4[2]
-        vmlal.s16 q4, d28, d4[3]
-
-        vmlal.s16 q5, d19, d2[2]
-        vmlal.s16 q5, d20, d2[3]
-        vmlal.s16 q5, d21, d3[0]
-        vmlal.s16 q5, d22, d3[1]
-        vmlal.s16 q5, d23, d3[2]
-
-        vmlal.s16 q5, d25, d3[3]
-        vmlal.s16 q5, d26, d4[0]
-        vmlal.s16 q5, d27, d4[1]
-        vmlal.s16 q5, d28, d4[2]
-        vmlal.s16 q5, d29, d4[3]
-
-        /* Last row */
-        /* Load the y base pointers in Qregs and post-increase the address by r7=#8 */
-        vld1.8  {d24, d25, d26}, [r5], r7      @  y0 ( y + 2 )
-
-        /* Signal memory for data that will be used in the loop after the next */
-        pld         [r5, r7]
-
-        /* Promoting the 8bit channels to 16bit */
-        vmovl.u8 q9,  d24
-        vmovl.u8 q10, d25
-        vmovl.u8 q11, d26
-
-/*
-        d18,  d19,  d20, d21, d22, d23,
-        d24,  d25
-
-        Only d18-d23 hold widened data for the last row; the multiplies
-        below do not read d24 and above.
-*/
-
-        vmlal.s16 q4, d18, d5[0]
-        vmlal.s16 q4, d19, d5[1]
-        vmlal.s16 q4, d20, d5[2]
-        vmlal.s16 q4, d21, d5[3]
-        vmlal.s16 q4, d22, d6[0]
-
-        vmlal.s16 q5, d19, d5[0]
-        vmlal.s16 q5, d20, d5[1]
-        vmlal.s16 q5, d21, d5[2]
-        vmlal.s16 q5, d22, d5[3]
-        vmlal.s16 q5, d23, d6[0]
-
-
-
-        vadd.i32 q4, q4, q15            /* add the 0x7f bias set up before the loop */
-        vadd.i32 q5, q5, q15
-
-/*      Narrow it to a d-reg 32 -> 16 bit.
-        vrshrn is the rounding form of the shift, applied on top of the bias above. */
-        vrshrn.i32 d8, q4, #8
-        vrshrn.i32 d9, q5, #8
-
-
-/*      Pack 16 -> 8 bit, saturate, put two pixels into D reg */
-        vqmovun.s16 d8, q4
-
-        vst1.8 d8, [r0]!           @ return the output and increase the address of r0
-
-        /* Are we done?
-           r6 counts 8-byte output groups and is tested after the decrement.
-           NOTE(review): a count of 0 on entry would wrap around instead of
-           exiting; presumably callers never pass 0 - confirm. */
-        subs r6, r6, #1
-        bne 1b
-
-        /* Yup, bye */
-        vpop        {q4-q7}
-        pop         {r4-r7, lr}
-        bx          lr
-
-END(rsdIntrinsicConvolve5x5_K)

diff --git a/toolkit/Histogram.cpp b/toolkit/Histogram.cpp
deleted file mode 100644
index 86b4bed..0000000
--- a/toolkit/Histogram.cpp
+++ /dev/null

@@ -1,299 +0,0 @@
-/*
- * Copyright (C) 2013 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <array>
-#include <cstdint>
-
-#include "RenderScriptToolkit.h"
-#include "TaskProcessor.h"
-#include "Utils.h"
-
-#define LOG_TAG "renderscript.toolkit.Histogram"
-
-namespace android {
-namespace renderscript {
-
-class HistogramTask : public Task {  // Counts per-channel byte values into per-thread histograms.
-    const uchar* mIn;  // Input pixels; each cell occupies paddedSize(mVectorSize) bytes.
-    std::vector<int> mSums;  // threadCount consecutive slices of 256 * paddedSize(vectorSize) counters.
-    uint32_t mThreadCount;  // Number of per-thread slices held in mSums.
-
-    // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
-    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
-                             size_t endY) override;
-
-    void kernelP1U4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);  // One row, 4 channels counted.
-    void kernelP1U3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);  // One row, 3 of 4 bytes counted.
-    void kernelP1U2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);  // One row, 2 channels counted.
-    void kernelP1U1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);  // One row, 1 channel counted.
-
-   public:
-    HistogramTask(const uint8_t* in, size_t sizeX, size_t sizeY, size_t vectorSize,
-                  uint32_t threadCount, const Restriction* restriction);
-    void collateSums(int* out);  // Adds the per-thread slices together into out.
-};
-
-class HistogramDotTask : public Task {  // Histograms a weighted sum of each pixel's channels.
-    const uchar* mIn;  // Input pixels; each cell occupies paddedSize(mVectorSize) bytes.
-    float mDot[4];  // Per-channel weights as floats.
-    int mDotI[4];  // The same weights scaled by 256 and rounded, used by the kernels.
-    std::vector<int> mSums;  // threadCount consecutive slices of 256 counters.
-    uint32_t mThreadCount;  // Number of per-thread slices held in mSums.
-
-    void kernelP1L4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);  // One row, 4 weighted channels.
-    void kernelP1L3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);  // One row, 3 weighted channels.
-    void kernelP1L2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);  // One row, 2 weighted channels.
-    void kernelP1L1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);  // One row, 1 weighted channel.
-
-   public:
-    HistogramDotTask(const uint8_t* in, size_t sizeX, size_t sizeY, size_t vectorSize,
-                     uint32_t threadCount, const float* coefficients,
-                     const Restriction* restriction);
-    void collateSums(int* out);  // Adds the per-thread slices together into out[0..255].
-
-    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
-                             size_t endY) override;  // Histograms one tile into the slice for threadIndex.
-};
-
-/**
- * Sets up a histogram task over the given input.
- *
- * mSums is value-initialized to zero and holds one slice of
- * 256 * paddedSize(vectorSize) counters for each of the threadCount threads.
- */
-HistogramTask::HistogramTask(const uchar* in, size_t sizeX, size_t sizeY, size_t vectorSize,
-                             uint32_t threadCount, const Restriction* restriction)
-    : Task{sizeX, sizeY, vectorSize, true, restriction},
-      mIn{in},
-      mSums(256 * paddedSize(vectorSize) * threadCount),
-      mThreadCount{threadCount} {}
-
-/**
- * Histograms the rows startY..endY-1 of one tile into the mSums slice that
- * belongs to threadIndex. The row kernel is chosen from mVectorSize (1 to 4);
- * any other value is logged and the tile is skipped.
- */
-void HistogramTask::processData(int threadIndex, size_t startX, size_t startY, size_t endX,
-                                size_t endY) {
-    typedef void (HistogramTask::*KernelFunction)(const uchar*, int*, uint32_t, uint32_t);
-
-    // Row kernels indexed by (mVectorSize - 1).
-    static const KernelFunction kKernels[] = {
-            &HistogramTask::kernelP1U1,
-            &HistogramTask::kernelP1U2,
-            &HistogramTask::kernelP1U3,
-            &HistogramTask::kernelP1U4,
-    };
-
-    if (mVectorSize < 1 || mVectorSize > 4) {
-        ALOGE("Bad vector size %zd", mVectorSize);
-        return;
-    }
-    const KernelFunction rowKernel = kKernels[mVectorSize - 1];
-
-    const auto cellBytes = paddedSize(mVectorSize);
-    int* threadSums = &mSums[256 * cellBytes * threadIndex];
-
-    for (size_t row = startY; row < endY; ++row) {
-        const uchar* rowStart = mIn + (mSizeX * row + startX) * cellBytes;
-        (this->*rowKernel)(rowStart, threadSums, startX, endX);
-    }
-}
-
-/**
- * Row kernel for 4-channel cells: for each of the xend - xstart cells, bumps the
- * counter of every channel. Counters are interleaved, so channel c of value v
- * lives at sums[v * 4 + c].
- */
-void HistogramTask::kernelP1U4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
-    const uchar* cell = in;
-    uint32_t column = xstart;
-    while (column < xend) {
-        for (int channel = 0; channel < 4; ++channel) {
-            sums[cell[channel] * 4 + channel] += 1;
-        }
-        cell += 4;
-        ++column;
-    }
-}
-
-/**
- * Row kernel for 3-channel cells stored in 4 bytes: counts the first three bytes
- * of each cell and skips the fourth. Channel c of value v lives at
- * sums[v * 4 + c].
- */
-void HistogramTask::kernelP1U3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
-    const uchar* cell = in;
-    uint32_t column = xstart;
-    while (column < xend) {
-        for (int channel = 0; channel < 3; ++channel) {
-            sums[cell[channel] * 4 + channel] += 1;
-        }
-        cell += 4;
-        ++column;
-    }
-}
-
-/**
- * Row kernel for 2-channel cells: channel c of value v lives at sums[v * 2 + c].
- */
-void HistogramTask::kernelP1U2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
-    const uchar* cell = in;
-    uint32_t column = xstart;
-    while (column < xend) {
-        sums[cell[0] * 2] += 1;
-        sums[cell[1] * 2 + 1] += 1;
-        cell += 2;
-        ++column;
-    }
-}
-
-/**
- * Row kernel for single-byte cells: bumps sums[v] for each of the
- * xend - xstart input bytes.
- */
-void HistogramTask::kernelP1U1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
-    const uchar* cell = in;
-    uint32_t column = xstart;
-    while (column < xend) {
-        sums[*cell] += 1;
-        ++cell;
-        ++column;
-    }
-}
-
-/**
- * Adds the per-thread histogram slices together.
- * out must have room for 256 * paddedSize(mVectorSize) ints.
- */
-void HistogramTask::collateSums(int* out) {
-    const auto sliceSize = 256 * paddedSize(mVectorSize);
-    for (uint32_t bin = 0; bin < sliceSize; ++bin) {
-        int total = mSums[bin];
-        for (uint32_t thread = 1; thread < mThreadCount; ++thread) {
-            total += mSums[bin + (sliceSize * thread)];
-        }
-        out[bin] = total;
-    }
-}
-
-/**
- * Sets up a dot-product histogram task.
- *
- * mSums holds one zeroed slice of 256 counters per thread. When coefficients is
- * nullptr the weights default to 0.299, 0.587, 0.114, 0. Otherwise only the first
- * vectorSize entries (at most 4) are read and the remaining weights are zero:
- * the row kernels never use more than vectorSize weights, and the validation in
- * histogramDot only inspects that many entries, so the caller's array is
- * presumably sized to vectorSize (confirm against the public header).
- *
- * The weights are also stored as fixed point (scaled by 256, rounded) in mDotI.
- */
-HistogramDotTask::HistogramDotTask(const uchar* in, size_t sizeX, size_t sizeY, size_t vectorSize,
-                                   uint32_t threadCount, const float* coefficients,
-                                   const Restriction* restriction)
-    : Task{sizeX, sizeY, vectorSize, true, restriction}, mIn{in}, mSums(256 * threadCount, 0) {
-    mThreadCount = threadCount;
-
-    for (int i = 0; i < 4; i++) {
-        mDot[i] = 0.f;
-    }
-    if (coefficients == nullptr) {
-        mDot[0] = 0.299f;
-        mDot[1] = 0.587f;
-        mDot[2] = 0.114f;
-    } else {
-        // Never read past the entries the caller is known to provide.
-        const size_t provided = vectorSize < 4 ? vectorSize : 4;
-        for (size_t i = 0; i < provided; i++) {
-            mDot[i] = coefficients[i];
-        }
-    }
-    for (int i = 0; i < 4; i++) {
-        mDotI[i] = (int)((mDot[i] * 256.f) + 0.5f);
-    }
-}
-
-/**
- * Histograms the rows startY..endY-1 of one tile into the 256-entry mSums slice
- * that belongs to threadIndex. The row kernel is chosen from mVectorSize (1 to 4);
- * any other value is reported as an error and the tile is skipped.
- */
-void HistogramDotTask::processData(int threadIndex, size_t startX, size_t startY, size_t endX,
-                                   size_t endY) {
-    typedef void (HistogramDotTask::*KernelFunction)(const uchar*, int*, uint32_t, uint32_t);
-
-    KernelFunction kernel = nullptr;
-    switch (mVectorSize) {
-        case 4:
-            kernel = &HistogramDotTask::kernelP1L4;
-            break;
-        case 3:
-            kernel = &HistogramDotTask::kernelP1L3;
-            break;
-        case 2:
-            kernel = &HistogramDotTask::kernelP1L2;
-            break;
-        case 1:
-            kernel = &HistogramDotTask::kernelP1L1;
-            break;
-        default:
-            // An unsupported vector size is an error, as in HistogramTask::processData.
-            ALOGE("Bad vector size %zd", mVectorSize);
-            return;
-    }
-
-    int* sums = &mSums[256 * threadIndex];
-    const auto cellBytes = paddedSize(mVectorSize);
-
-    for (size_t y = startY; y < endY; y++) {
-        const uchar* inPtr = mIn + (mSizeX * y + startX) * cellBytes;
-        // Direct pointer-to-member call; avoids relying on <functional> for std::invoke.
-        (this->*kernel)(inPtr, sums, startX, endX);
-    }
-}
-
-/**
- * Row kernel for 4-channel cells: bumps the bin of the weighted channel sum for
- * each of the xend - xstart cells.
- *
- * The bin index is clamped to 0..255. The fixed-point weights are rounded
- * individually, so their total can exceed 256 even when the float weights sum to
- * at most 1 (e.g. 0.252, 0.252, 0.252, 0.243 gives 257); an all-255 cell would
- * then land on bin 256, outside this thread's 256-entry slice.
- */
-void HistogramDotTask::kernelP1L4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
-    for (uint32_t x = xstart; x < xend; x++) {
-        int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]) + (mDotI[2] * in[2]) + (mDotI[3] * in[3]);
-        int bin = (t + 0x7f) >> 8;
-        if (bin < 0) {
-            bin = 0;
-        } else if (bin > 255) {
-            bin = 255;
-        }
-        sums[bin]++;
-        in += 4;
-    }
-}
-
-/**
- * Row kernel for 3-channel cells stored in 4 bytes: bumps the bin of the weighted
- * sum of the first three bytes of each cell.
- *
- * The bin index is clamped to 0..255 because the individually rounded
- * fixed-point weights can total more than 256, which would otherwise index past
- * this thread's 256-entry slice for bright cells.
- */
-void HistogramDotTask::kernelP1L3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
-    for (uint32_t x = xstart; x < xend; x++) {
-        int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]) + (mDotI[2] * in[2]);
-        int bin = (t + 0x7f) >> 8;
-        if (bin < 0) {
-            bin = 0;
-        } else if (bin > 255) {
-            bin = 255;
-        }
-        sums[bin]++;
-        in += 4;
-    }
-}
-
-/**
- * Row kernel for 2-channel cells: bumps the bin of the weighted sum of both
- * bytes of each cell.
- *
- * The bin index is clamped to 0..255 because the individually rounded
- * fixed-point weights can total more than 256, which would otherwise index past
- * this thread's 256-entry slice for bright cells.
- */
-void HistogramDotTask::kernelP1L2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
-    for (uint32_t x = xstart; x < xend; x++) {
-        int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]);
-        int bin = (t + 0x7f) >> 8;
-        if (bin < 0) {
-            bin = 0;
-        } else if (bin > 255) {
-            bin = 255;
-        }
-        sums[bin]++;
-        in += 2;
-    }
-}
-
-/**
- * Row kernel for single-byte cells: bumps the bin of the weighted value of each
- * input byte.
- *
- * The bin index is clamped to 0..255 so that an out-of-range weight (possible
- * when argument validation is compiled out) cannot index outside this thread's
- * 256-entry slice.
- */
-void HistogramDotTask::kernelP1L1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
-    for (uint32_t x = xstart; x < xend; x++) {
-        int t = (mDotI[0] * in[0]);
-        int bin = (t + 0x7f) >> 8;
-        if (bin < 0) {
-            bin = 0;
-        } else if (bin > 255) {
-            bin = 255;
-        }
-        sums[bin]++;
-        in++;
-    }
-}
-
-/**
- * Adds the per-thread 256-entry slices together. out must have room for 256 ints.
- */
-void HistogramDotTask::collateSums(int* out) {
-    for (uint32_t bin = 0; bin < 256; ++bin) {
-        int total = mSums[bin];
-        for (uint32_t thread = 1; thread < mThreadCount; ++thread) {
-            total += mSums[bin + (256 * thread)];
-        }
-        out[bin] = total;
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////
-
-/**
- * Computes a per-channel histogram of the input: builds a HistogramTask with one
- * slice per processor thread, runs it, then sums the slices into out.
- *
- * When ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE is defined, the restriction and the
- * vector size (1 to 4) are checked first; a failed check returns without
- * writing to out.
- */
-void RenderScriptToolkit::histogram(const uint8_t* in, int32_t* out, size_t sizeX, size_t sizeY,
-                                    size_t vectorSize, const Restriction* restriction) {
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
-    const bool restrictionOk = validRestriction(LOG_TAG, sizeX, sizeY, restriction);
-    if (!restrictionOk) {
-        return;
-    }
-    const bool vectorSizeOk = vectorSize >= 1 && vectorSize <= 4;
-    if (!vectorSizeOk) {
-        ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
-        return;
-    }
-#endif
-
-    const auto threadCount = processor->getNumberOfThreads();
-    HistogramTask histogramTask(in, sizeX, sizeY, vectorSize, threadCount, restriction);
-    processor->doTask(&histogramTask);
-    histogramTask.collateSums(out);
-}
-
-/**
- * Computes a histogram of the weighted sum of each cell's channels: builds a
- * HistogramDotTask with one slice per processor thread, runs it, then sums the
- * slices into out.
- *
- * When ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE is defined, the restriction, the
- * vector size (1 to 4) and, if given, the first vectorSize coefficients are
- * checked; a failed check returns without writing to out. The coefficient checks
- * are written as negated comparisons so that NaN values are rejected as well
- * (NaN fails every ordered comparison and would otherwise slip through).
- */
-void RenderScriptToolkit::histogramDot(const uint8_t* in, int32_t* out, size_t sizeX, size_t sizeY,
-                                       size_t vectorSize, const float* coefficients,
-                                       const Restriction* restriction) {
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
-    if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
-        return;
-    }
-    if (vectorSize < 1 || vectorSize > 4) {
-        ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
-        return;
-    }
-    if (coefficients != nullptr) {
-        float sum = 0.0f;
-        for (size_t i = 0; i < vectorSize; i++) {
-            // Negated form also catches NaN.
-            if (!(coefficients[i] >= 0.0f)) {
-                ALOGE("histogramDot coefficients should not be negative. Coefficient %zu was %f.",
-                      i, coefficients[i]);
-                return;
-            }
-            sum += coefficients[i];
-        }
-        // Negated form also catches a NaN or infinite sum.
-        if (!(sum <= 1.0f)) {
-            ALOGE("histogramDot coefficients should add to 1 or less. Their sum is %f.", sum);
-            return;
-        }
-    }
-#endif
-
-    HistogramDotTask task(in, sizeX, sizeY, vectorSize, processor->getNumberOfThreads(),
-                          coefficients, restriction);
-    processor->doTask(&task);
-    task.collateSums(out);
-}
-
-}  // namespace renderscript
-}  // namespace android

diff --git a/toolkit/JniEntryPoints.cpp b/toolkit/JniEntryPoints.cpp
deleted file mode 100644
index 3bf5911..0000000
--- a/toolkit/JniEntryPoints.cpp
+++ /dev/null

@@ -1,480 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <android/bitmap.h>
-#include <assert.h>
-#include <jni.h>
-
-#include "RenderScriptToolkit.h"
-#include "Utils.h"
-
-#define LOG_TAG "renderscript.toolkit.JniEntryPoints"
-
-using namespace android::renderscript;
-
-/**
- * I compared using env->GetPrimitiveArrayCritical vs. env->GetByteArrayElements to get access
- * to the underlying data. On Pixel 4, it's actually faster to not use critical. The code is left
- * here if you want to experiment. Note that USE_CRITICAL could block the garbage collector.
- */
-// #define USE_CRITICAL
-
-class ByteArrayGuard {
-   private:
-    JNIEnv* env;
-    jbyteArray array;
-    jbyte* data;
-
-   public:
-    ByteArrayGuard(JNIEnv* env, jbyteArray array) : env{env}, array{array} {
-#ifdef USE_CRITICAL
-        data = reinterpret_cast<jbyte*>(env->GetPrimitiveArrayCritical(array, nullptr));
-#else
-        data = env->GetByteArrayElements(array, nullptr);
-#endif
-    }
-    ~ByteArrayGuard() {
-#ifdef USE_CRITICAL
-        env->ReleasePrimitiveArrayCritical(array, data, 0);
-#else
-        env->ReleaseByteArrayElements(array, data, 0);
-#endif
-    }
-    uint8_t* get() { return reinterpret_cast<uint8_t*>(data); }
-};
-
-class IntArrayGuard {
-   private:
-    JNIEnv* env;
-    jintArray array;
-    jint* data;
-
-   public:
-    IntArrayGuard(JNIEnv* env, jintArray array) : env{env}, array{array} {
-#ifdef USE_CRITICAL
-        data = reinterpret_cast<jint*>(env->GetPrimitiveArrayCritical(array, nullptr));
-#else
-        data = env->GetIntArrayElements(array, nullptr);
-#endif
-    }
-    ~IntArrayGuard() {
-#ifdef USE_CRITICAL
-        env->ReleasePrimitiveArrayCritical(array, data, 0);
-#else
-        env->ReleaseIntArrayElements(array, data, 0);
-#endif
-    }
-    int* get() { return reinterpret_cast<int*>(data); }
-};
-
-class FloatArrayGuard {
-   private:
-    JNIEnv* env;
-    jfloatArray array;
-    jfloat* data;
-
-   public:
-    FloatArrayGuard(JNIEnv* env, jfloatArray array) : env{env}, array{array} {
-#ifdef USE_CRITICAL
-        data = reinterpret_cast<jfloat*>(env->GetPrimitiveArrayCritical(array, nullptr));
-#else
-        data = env->GetFloatArrayElements(array, nullptr);
-#endif
-    }
-    ~FloatArrayGuard() {
-#ifdef USE_CRITICAL
-        env->ReleasePrimitiveArrayCritical(array, data, 0);
-#else
-        env->ReleaseFloatArrayElements(array, data, 0);
-#endif
-    }
-    float* get() { return reinterpret_cast<float*>(data); }
-};
-
-class BitmapGuard {
-   private:
-    JNIEnv* env;
-    jobject bitmap;
-    AndroidBitmapInfo info;
-    int bytesPerPixel;
-    void* bytes;
-    bool valid;
-
-   public:
-    BitmapGuard(JNIEnv* env, jobject jBitmap) : env{env}, bitmap{jBitmap}, bytes{nullptr} {
-        valid = false;
-        if (AndroidBitmap_getInfo(env, bitmap, &info) != ANDROID_BITMAP_RESULT_SUCCESS) {
-            ALOGE("AndroidBitmap_getInfo failed");
-            return;
-        }
-        if (info.format != ANDROID_BITMAP_FORMAT_RGBA_8888 &&
-            info.format != ANDROID_BITMAP_FORMAT_A_8) {
-            ALOGE("AndroidBitmap in the wrong format");
-            return;
-        }
-        bytesPerPixel = info.stride / info.width;
-        if (bytesPerPixel != 1 && bytesPerPixel != 4) {
-            ALOGE("Expected a vector size of 1 or 4. Got %d. Extra padding per line not currently "
-                  "supported",
-                  bytesPerPixel);
-            return;
-        }
-        if (AndroidBitmap_lockPixels(env, bitmap, &bytes) != ANDROID_BITMAP_RESULT_SUCCESS) {
-            ALOGE("AndroidBitmap_lockPixels failed");
-            return;
-        }
-        valid = true;
-    }
-    ~BitmapGuard() {
-        if (valid) {
-            AndroidBitmap_unlockPixels(env, bitmap);
-        }
-    }
-    uint8_t* get() const {
-        assert(valid);
-        return reinterpret_cast<uint8_t*>(bytes);
-    }
-    int width() const { return info.width; }
-    int height() const { return info.height; }
-    int vectorSize() const { return bytesPerPixel; }
-};
-
-/**
- * Copies the content of Kotlin Range2d object into the equivalent C++ struct.
- */
-class RestrictionParameter {
-   private:
-    bool isNull;
-    Restriction restriction;
-
-   public:
-    RestrictionParameter(JNIEnv* env, jobject jRestriction) : isNull{jRestriction == nullptr} {
-        if (isNull) {
-            return;
-        }
-        /* TODO Measure how long FindClass and related functions take. Consider passing the
-         * four values instead. This would also require setting the default when Range2D is null.
-         */
-        jclass restrictionClass = env->FindClass("android/renderscript/toolkit/Range2d");
-        if (restrictionClass == nullptr) {
-            ALOGE("RenderScriptToolit. Internal error. Could not find the Kotlin Range2d class.");
-            isNull = true;
-            return;
-        }
-        jfieldID startXId = env->GetFieldID(restrictionClass, "startX", "I");
-        jfieldID startYId = env->GetFieldID(restrictionClass, "startY", "I");
-        jfieldID endXId = env->GetFieldID(restrictionClass, "endX", "I");
-        jfieldID endYId = env->GetFieldID(restrictionClass, "endY", "I");
-        restriction.startX = env->GetIntField(jRestriction, startXId);
-        restriction.startY = env->GetIntField(jRestriction, startYId);
-        restriction.endX = env->GetIntField(jRestriction, endXId);
-        restriction.endY = env->GetIntField(jRestriction, endYId);
-    }
-    Restriction* get() { return isNull ? nullptr : &restriction; }
-};
-
-extern "C" JNIEXPORT jlong JNICALL
-Java_android_renderscript_toolkit_Toolkit_createNative(JNIEnv* /*env*/, jobject /*thiz*/) {
-    return reinterpret_cast<jlong>(new RenderScriptToolkit());
-}
-
-extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_destroyNative(
-        JNIEnv* /*env*/, jobject /*thiz*/, jlong native_handle) {
-    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    delete toolkit;
-}
-
-extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeBlend(
-        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jint jmode, jbyteArray source_array,
-        jbyteArray dest_array, jint size_x, jint size_y, jobject restriction) {
-    auto toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    auto mode = static_cast<RenderScriptToolkit::BlendingMode>(jmode);
-    RestrictionParameter restrict {env, restriction};
-    ByteArrayGuard source{env, source_array};
-    ByteArrayGuard dest{env, dest_array};
-
-    toolkit->blend(mode, source.get(), dest.get(), size_x, size_y, restrict.get());
-}
-
-extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeBlendBitmap(
-        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jint jmode, jobject source_bitmap,
-        jobject dest_bitmap, jobject restriction) {
-    auto toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    auto mode = static_cast<RenderScriptToolkit::BlendingMode>(jmode);
-    RestrictionParameter restrict {env, restriction};
-    BitmapGuard source{env, source_bitmap};
-    BitmapGuard dest{env, dest_bitmap};
-
-    toolkit->blend(mode, source.get(), dest.get(), source.width(), source.height(), restrict.get());
-}
-
-extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeBlur(
-        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array, jint vectorSize,
-        jint size_x, jint size_y, jint radius, jbyteArray output_array, jobject restriction) {
-    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    RestrictionParameter restrict {env, restriction};
-    ByteArrayGuard input{env, input_array};
-    ByteArrayGuard output{env, output_array};
-
-    toolkit->blur(input.get(), output.get(), size_x, size_y, vectorSize, radius, restrict.get());
-}
-
-extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeBlurBitmap(
-        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
-        jobject output_bitmap, jint radius, jobject restriction) {
-    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    RestrictionParameter restrict {env, restriction};
-    BitmapGuard input{env, input_bitmap};
-    BitmapGuard output{env, output_bitmap};
-
-    toolkit->blur(input.get(), output.get(), input.width(), input.height(), input.vectorSize(),
-                  radius, restrict.get());
-}
-
-extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeColorMatrix(
-        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
-        jint input_vector_size, jint size_x, jint size_y, jbyteArray output_array,
-        jint output_vector_size, jfloatArray jmatrix, jfloatArray add_vector, jobject restriction) {
-    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    RestrictionParameter restrict {env, restriction};
-    ByteArrayGuard input{env, input_array};
-    ByteArrayGuard output{env, output_array};
-    FloatArrayGuard matrix{env, jmatrix};
-    FloatArrayGuard add{env, add_vector};
-
-    toolkit->colorMatrix(input.get(), output.get(), input_vector_size, output_vector_size, size_x,
-                         size_y, matrix.get(), add.get(), restrict.get());
-}
-
-extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeColorMatrixBitmap(
-        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
-        jobject output_bitmap, jfloatArray jmatrix, jfloatArray add_vector, jobject restriction) {
-    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    RestrictionParameter restrict {env, restriction};
-    BitmapGuard input{env, input_bitmap};
-    BitmapGuard output{env, output_bitmap};
-    FloatArrayGuard matrix{env, jmatrix};
-    FloatArrayGuard add{env, add_vector};
-
-    toolkit->colorMatrix(input.get(), output.get(), input.vectorSize(), output.vectorSize(),
-                         input.width(), input.height(), matrix.get(), add.get(), restrict.get());
-}
-
-extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeConvolve(
-        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array, jint vectorSize,
-        jint size_x, jint size_y, jbyteArray output_array, jfloatArray coefficients,
-        jobject restriction) {
-    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    RestrictionParameter restrict {env, restriction};
-    ByteArrayGuard input{env, input_array};
-    ByteArrayGuard output{env, output_array};
-    FloatArrayGuard coeffs{env, coefficients};
-
-    switch (env->GetArrayLength(coefficients)) {
-        case 9:
-            toolkit->convolve3x3(input.get(), output.get(), vectorSize, size_x, size_y,
-                                 coeffs.get(), restrict.get());
-            break;
-        case 25:
-            toolkit->convolve5x5(input.get(), output.get(), vectorSize, size_x, size_y,
-                                 coeffs.get(), restrict.get());
-            break;
-    }
-}
-
-extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeConvolveBitmap(
-        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
-        jobject output_bitmap, jfloatArray coefficients, jobject restriction) {
-    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    RestrictionParameter restrict {env, restriction};
-    BitmapGuard input{env, input_bitmap};
-    BitmapGuard output{env, output_bitmap};
-    FloatArrayGuard coeffs{env, coefficients};
-
-    switch (env->GetArrayLength(coefficients)) {
-        case 9:
-            toolkit->convolve3x3(input.get(), output.get(), input.vectorSize(), input.width(),
-                                 input.height(), coeffs.get(), restrict.get());
-            break;
-        case 25:
-            toolkit->convolve5x5(input.get(), output.get(), input.vectorSize(), input.width(),
-                                 input.height(), coeffs.get(), restrict.get());
-            break;
-    }
-}
-
-extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeHistogram(
-        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
-        jint vector_size, jint size_x, jint size_y, jintArray output_array, jobject restriction) {
-    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    RestrictionParameter restrict {env, restriction};
-    ByteArrayGuard input{env, input_array};
-    IntArrayGuard output{env, output_array};
-
-    toolkit->histogram(input.get(), output.get(), size_x, size_y, vector_size, restrict.get());
-}
-
-extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeHistogramBitmap(
-        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
-        jintArray output_array, jobject restriction) {
-    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    RestrictionParameter restrict {env, restriction};
-    BitmapGuard input{env, input_bitmap};
-    IntArrayGuard output{env, output_array};
-
-    toolkit->histogram(input.get(), output.get(), input.width(), input.height(), input.vectorSize(),
-                       restrict.get());
-}
-
-extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeHistogramDot(
-        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
-        jint vector_size, jint size_x, jint size_y, jintArray output_array,
-        jfloatArray coefficients, jobject restriction) {
-    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    RestrictionParameter restrict {env, restriction};
-    ByteArrayGuard input{env, input_array};
-    IntArrayGuard output{env, output_array};
-    FloatArrayGuard coeffs{env, coefficients};
-
-    toolkit->histogramDot(input.get(), output.get(), size_x, size_y, vector_size, coeffs.get(),
-                          restrict.get());
-}
-
-extern "C" JNIEXPORT
-void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeHistogramDotBitmap(
-        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
-        jintArray output_array, jfloatArray coefficients, jobject restriction) {
-    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    RestrictionParameter restrict {env, restriction};
-    BitmapGuard input{env, input_bitmap};
-    IntArrayGuard output{env, output_array};
-    FloatArrayGuard coeffs{env, coefficients};
-
-    toolkit->histogramDot(input.get(), output.get(), input.width(), input.height(),
-                          input.vectorSize(), coeffs.get(), restrict.get());
-}
-
-extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeLut(
-        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
-        jbyteArray output_array, jint size_x, jint size_y, jbyteArray red_table,
-        jbyteArray green_table, jbyteArray blue_table, jbyteArray alpha_table,
-        jobject restriction) {
-    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    RestrictionParameter restrict {env, restriction};
-
-    ByteArrayGuard input{env, input_array};
-    ByteArrayGuard output{env, output_array};
-    ByteArrayGuard red{env, red_table};
-    ByteArrayGuard green{env, green_table};
-    ByteArrayGuard blue{env, blue_table};
-    ByteArrayGuard alpha{env, alpha_table};
-
-    toolkit->lut(input.get(), output.get(), size_x, size_y, red.get(), green.get(), blue.get(),
-                 alpha.get(), restrict.get());
-}
-
-extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeLutBitmap(
-        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
-        jobject output_bitmap, jbyteArray red_table, jbyteArray green_table, jbyteArray blue_table,
-        jbyteArray alpha_table, jobject restriction) {
-    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    RestrictionParameter restrict {env, restriction};
-
-    BitmapGuard input{env, input_bitmap};
-    BitmapGuard output{env, output_bitmap};
-    ByteArrayGuard red{env, red_table};
-    ByteArrayGuard green{env, green_table};
-    ByteArrayGuard blue{env, blue_table};
-    ByteArrayGuard alpha{env, alpha_table};
-
-    toolkit->lut(input.get(), output.get(), input.width(), input.height(), red.get(), green.get(),
-                 blue.get(), alpha.get(), restrict.get());
-}
-
-extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeLut3d(
-        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
-        jbyteArray output_array, jint size_x, jint size_y, jbyteArray cube_values, jint cubeSizeX,
-        jint cubeSizeY, jint cubeSizeZ, jobject restriction) {
-    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    RestrictionParameter restrict {env, restriction};
-    ByteArrayGuard input{env, input_array};
-    ByteArrayGuard output{env, output_array};
-    ByteArrayGuard cube{env, cube_values};
-
-    toolkit->lut3d(input.get(), output.get(), size_x, size_y, cube.get(), cubeSizeX, cubeSizeY,
-                   cubeSizeZ, restrict.get());
-}
-
-extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeLut3dBitmap(
-        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
-        jobject output_bitmap, jbyteArray cube_values, jint cubeSizeX, jint cubeSizeY,
-        jint cubeSizeZ, jobject restriction) {
-    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    RestrictionParameter restrict {env, restriction};
-    BitmapGuard input{env, input_bitmap};
-    BitmapGuard output{env, output_bitmap};
-    ByteArrayGuard cube{env, cube_values};
-
-    toolkit->lut3d(input.get(), output.get(), input.width(), input.height(), cube.get(), cubeSizeX,
-                   cubeSizeY, cubeSizeZ, restrict.get());
-}
-
-extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeResize(
-        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
-        jint vector_size, jint input_size_x, jint input_size_y, jbyteArray output_array,
-        jint output_size_x, jint output_size_y, jobject restriction) {
-    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    RestrictionParameter restrict {env, restriction};
-    ByteArrayGuard input{env, input_array};
-    ByteArrayGuard output{env, output_array};
-
-    toolkit->resize(input.get(), output.get(), input_size_x, input_size_y, vector_size,
-                    output_size_x, output_size_y, restrict.get());
-}
-
-extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeResizeBitmap(
-        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
-        jobject output_bitmap, jobject restriction) {
-    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    RestrictionParameter restrict {env, restriction};
-    BitmapGuard input{env, input_bitmap};
-    BitmapGuard output{env, output_bitmap};
-
-    toolkit->resize(input.get(), output.get(), input.width(), input.height(), input.vectorSize(),
-                    output.width(), output.height(), restrict.get());
-}
-
-extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeYuvToRgb(
-        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
-        jbyteArray output_array, jint size_x, jint size_y, jint format) {
-    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    ByteArrayGuard input{env, input_array};
-    ByteArrayGuard output{env, output_array};
-
-    toolkit->yuvToRgb(input.get(), output.get(), size_x, size_y,
-                      static_cast<RenderScriptToolkit::YuvFormat>(format));
-}
-
-extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeYuvToRgbBitmap(
-        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array, jint size_x,
-        jint size_y, jobject output_bitmap, jint format) {
-    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
-    BitmapGuard output{env, output_bitmap};
-    ByteArrayGuard input{env, input_array};
-
-    toolkit->yuvToRgb(input.get(), output.get(), size_x, size_y,
-                      static_cast<RenderScriptToolkit::YuvFormat>(format));
-}

diff --git a/toolkit/Lut.cpp b/toolkit/Lut.cpp
deleted file mode 100644
index 4ac5cdc..0000000
--- a/toolkit/Lut.cpp
+++ /dev/null

@@ -1,82 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-
-#include "RenderScriptToolkit.h"
-#include "TaskProcessor.h"
-#include "Utils.h"
-
-#define LOG_TAG "renderscript.toolkit.Lut"
-
-namespace android {
-namespace renderscript {
-
-class LutTask : public Task {
-    const uchar4* mIn;
-    uchar4* mOut;
-    const uchar* mRedTable;
-    const uchar* mGreenTable;
-    const uchar* mBlueTable;
-    const uchar* mAlphaTable;
-
-    // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
-    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
-                             size_t endY) override;
-
-   public:
-    LutTask(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY, const uint8_t* red,
-            const uint8_t* green, const uint8_t* blue, const uint8_t* alpha,
-            const Restriction* restriction)
-        : Task{sizeX, sizeY, 4, true, restriction},
-          mIn{reinterpret_cast<const uchar4*>(input)},
-          mOut{reinterpret_cast<uchar4*>(output)},
-          mRedTable{red},
-          mGreenTable{green},
-          mBlueTable{blue},
-          mAlphaTable{alpha} {}
-};
-
-void LutTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
-                          size_t endY) {
-    for (size_t y = startY; y < endY; y++) {
-        size_t offset = mSizeX * y + startX;
-        const uchar4* in = mIn + offset;
-        uchar4* out = mOut + offset;
-        for (size_t x = startX; x < endX; x++) {
-            auto v = *in;
-            *out = uchar4{mRedTable[v.x], mGreenTable[v.y], mBlueTable[v.z], mAlphaTable[v.w]};
-            in++;
-            out++;
-        }
-    }
-}
-
-void RenderScriptToolkit::lut(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY,
-                              const uint8_t* red, const uint8_t* green, const uint8_t* blue,
-                              const uint8_t* alpha, const Restriction* restriction) {
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
-    if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
-        return;
-    }
-#endif
-
-    LutTask task(input, output, sizeX, sizeY, red, green, blue, alpha, restriction);
-    processor->doTask(&task);
-}
-
-}  // namespace renderscript
-}  // namespace android

diff --git a/toolkit/Lut3d.cpp b/toolkit/Lut3d.cpp
deleted file mode 100644
index f8a7d61..0000000
--- a/toolkit/Lut3d.cpp
+++ /dev/null

@@ -1,181 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-
-#include "RenderScriptToolkit.h"
-#include "TaskProcessor.h"
-#include "Utils.h"
-
-namespace android {
-namespace renderscript {
-
-#define LOG_TAG "renderscript.toolkit.Lut3d"
-
-/**
- * Converts a RGBA buffer using a 3D cube.
- */
-class Lut3dTask : public Task {
-    // The input array we're transforming.
-    const uchar4* mIn;
-    // Where we'll store the transformed result.
-    uchar4* mOut;
-    // The size of each of the three cube dimensions. We don't make use of the last value.
-    int4 mCubeDimension;
-    // The translation cube, in row major format.
-    const uchar* mCubeTable;
-
-    /**
-     * Converts a subset of a line of the 2D buffer.
-     *
-     * @param in The start of the data to transform.
-     * @param out Where to store the result.
-     * @param length The number of 4-byte vectors to transform.
-     */
-    void kernel(const uchar4* in, uchar4* out, uint32_t length);
-
-    // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
-    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
-                             size_t endY) override;
-
-   public:
-    Lut3dTask(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY,
-              const uint8_t* cube, int cubeSizeX, int cubeSizeY, int cubeSizeZ,
-              const Restriction* restriction)
-        : Task{sizeX, sizeY, 4, true, restriction},
-          mIn{reinterpret_cast<const uchar4*>(input)},
-          mOut{reinterpret_cast<uchar4*>(output)},
-          mCubeDimension{cubeSizeX, cubeSizeY, cubeSizeZ, 0},
-          mCubeTable{cube} {}
-};
-
-extern "C" void rsdIntrinsic3DLUT_K(void* dst, void const* in, size_t count, void const* lut,
-                                    int32_t pitchy, int32_t pitchz, int dimx, int dimy, int dimz);
-
-void Lut3dTask::kernel(const uchar4* in, uchar4* out, uint32_t length) {
-    uint32_t x1 = 0;
-    uint32_t x2 = length;
-
-    const uchar* bp = mCubeTable;
-
-    int4 dims = mCubeDimension - 1;
-
-    const float4 m = (float4)(1.f / 255.f) * convert<float4>(dims);
-    const int4 coordMul = convert<int4>(m * (float4)0x8000);
-    const size_t stride_y = mCubeDimension.x * 4;
-    const size_t stride_z = stride_y * mCubeDimension.y;
-
-    // ALOGE("strides %zu %zu", stride_y, stride_z);
-
-#if defined(ARCH_ARM_USE_INTRINSICS)
-    if (mUsesSimd) {
-        int32_t len = x2 - x1;
-        if (len > 0) {
-            rsdIntrinsic3DLUT_K(out, in, len, bp, stride_y, stride_z, dims.x, dims.y, dims.z);
-            x1 += len;
-            out += len;
-            in += len;
-        }
-    }
-#endif
-
-    while (x1 < x2) {
-        int4 baseCoord = convert<int4>(*in) * coordMul;
-        int4 coord1 = baseCoord >> (int4)15;
-        // int4 coord2 = min(coord1 + 1, gDims - 1);
-
-        int4 weight2 = baseCoord & 0x7fff;
-        int4 weight1 = (int4)0x8000 - weight2;
-
-        // ALOGE("coord1      %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w);
-        const uchar* bp2 = bp + (coord1.x * 4) + (coord1.y * stride_y) + (coord1.z * stride_z);
-        const uchar4* pt_00 = (const uchar4*)&bp2[0];
-        const uchar4* pt_10 = (const uchar4*)&bp2[stride_y];
-        const uchar4* pt_01 = (const uchar4*)&bp2[stride_z];
-        const uchar4* pt_11 = (const uchar4*)&bp2[stride_y + stride_z];
-
-        uint4 v000 = convert<uint4>(pt_00[0]);
-        uint4 v100 = convert<uint4>(pt_00[1]);
-        uint4 v010 = convert<uint4>(pt_10[0]);
-        uint4 v110 = convert<uint4>(pt_10[1]);
-        uint4 v001 = convert<uint4>(pt_01[0]);
-        uint4 v101 = convert<uint4>(pt_01[1]);
-        uint4 v011 = convert<uint4>(pt_11[0]);
-        uint4 v111 = convert<uint4>(pt_11[1]);
-
-        uint4 yz00 = ((v000 * weight1.x) + (v100 * weight2.x)) >> (int4)7;
-        uint4 yz10 = ((v010 * weight1.x) + (v110 * weight2.x)) >> (int4)7;
-        uint4 yz01 = ((v001 * weight1.x) + (v101 * weight2.x)) >> (int4)7;
-        uint4 yz11 = ((v011 * weight1.x) + (v111 * weight2.x)) >> (int4)7;
-
-        uint4 z0 = ((yz00 * weight1.y) + (yz10 * weight2.y)) >> (int4)15;
-        uint4 z1 = ((yz01 * weight1.y) + (yz11 * weight2.y)) >> (int4)15;
-
-        uint4 v = ((z0 * weight1.z) + (z1 * weight2.z)) >> (int4)15;
-        uint4 v2 = (v + 0x7f) >> (int4)8;
-
-        uchar4 ret = convert<uchar4>(v2);
-        ret.w = in->w;
-
-#if 0
-        if (!x1) {
-            ALOGE("in          %08x %08x %08x %08x", in->r, in->g, in->b, in->a);
-            ALOGE("baseCoord   %08x %08x %08x %08x", baseCoord.x, baseCoord.y, baseCoord.z,
-                  baseCoord.w);
-            ALOGE("coord1      %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w);
-            ALOGE("weight1     %08x %08x %08x %08x", weight1.x, weight1.y, weight1.z, weight1.w);
-            ALOGE("weight2     %08x %08x %08x %08x", weight2.x, weight2.y, weight2.z, weight2.w);
-
-            ALOGE("v000        %08x %08x %08x %08x", v000.x, v000.y, v000.z, v000.w);
-            ALOGE("v100        %08x %08x %08x %08x", v100.x, v100.y, v100.z, v100.w);
-            ALOGE("yz00        %08x %08x %08x %08x", yz00.x, yz00.y, yz00.z, yz00.w);
-            ALOGE("z0          %08x %08x %08x %08x", z0.x, z0.y, z0.z, z0.w);
-
-            ALOGE("v           %08x %08x %08x %08x", v.x, v.y, v.z, v.w);
-            ALOGE("v2          %08x %08x %08x %08x", v2.x, v2.y, v2.z, v2.w);
-        }
-#endif
-        *out = ret;
-
-        in++;
-        out++;
-        x1++;
-    }
-}
-
-void Lut3dTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
-                            size_t endY) {
-    for (size_t y = startY; y < endY; y++) {
-        size_t offset = mSizeX * y + startX;
-        kernel(mIn + offset, mOut + offset, endX - startX);
-    }
-}
-
-void RenderScriptToolkit::lut3d(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY,
-                                const uint8_t* cube, size_t cubeSizeX, size_t cubeSizeY,
-                                size_t cubeSizeZ, const Restriction* restriction) {
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
-    if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
-        return;
-    }
-#endif
-
-    Lut3dTask task(input, output, sizeX, sizeY, cube, cubeSizeX, cubeSizeY, cubeSizeZ, restriction);
-    processor->doTask(&task);
-}
-
-}  // namespace renderscript
-}  // namespace android

diff --git a/toolkit/Lut3d_advsimd.S b/toolkit/Lut3d_advsimd.S
deleted file mode 100644
index edcb038..0000000
--- a/toolkit/Lut3d_advsimd.S
+++ /dev/null

@@ -1,250 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
-#define END(f) .size f, .-f;
-
-
-.macro lanepair dst, src0, src1, xr0, xr1, yr0, yr1, zr0, zr1
-
-            smov        x6, \src0
-            smov        x7, \src1
-
-            add         x6, x6, x3
-            add         x7, x7, x3
-
-            ld1         {v16.2s}, [x6], x4
-            ld1         {v17.2s}, [x7], x4
-
-            ld1         {v18.2s}, [x6], x5
-            ld1         {v19.2s}, [x7], x5
-
-            dup         v8.8b, \yr0
-            dup         v9.8b, \yr1
-            /* Y interpolate, front, lanes 0 and 1 -> v12 and v13 */
-            zip1        v12.16b, v5.16b, v16.16b
-            zip1        v13.16b, v5.16b, v17.16b
-            umlsl       v12.8h, v16.8b, v8.8b
-            umlsl       v13.8h, v17.8b, v9.8b
-            umlal       v12.8h, v18.8b, v8.8b
-            umlal       v13.8h, v19.8b, v9.8b
-
-            ld1         {v18.2s}, [x6]
-            ld1         {v19.2s}, [x7]
-
-            sub         x6, x6, x4
-            sub         x7, x7, x4
-
-            ld1         {v16.2s}, [x6]
-            ld1         {v17.2s}, [x7]
-
-            /* Y interpolate, rear, lanes 0 and 1 -> v14 and v15 */
-            zip1        v14.16b, v5.16b, v16.16b
-            zip1        v15.16b, v5.16b, v17.16b
-            umlsl       v14.8h, v16.8b, v8.8b
-            umlsl       v15.8h, v17.8b, v9.8b
-            umlal       v14.8h, v18.8b, v8.8b
-            umlal       v15.8h, v19.8b, v9.8b
-
-            /* Z interpolate, lane 0 v12/v14 -> v10 */
-            ushll       v8.4s, v12.4h, #8
-            ushll2      v9.4s, v12.8h, #8
-            umlsl       v8.4s, v12.4h, \zr0
-            umlsl2      v9.4s, v12.8h, \zr0
-            umlal       v8.4s, v14.4h, \zr0
-            umlal2      v9.4s, v14.8h, \zr0
-            rshrn       v10.4h, v8.4s, #8
-            rshrn2      v10.8h, v9.4s, #8
-
-            /* Z interpolate, lane 1 v13/v15 -> v11 */
-            ushll       v8.4s, v13.4h, #8
-            ushll2      v9.4s, v13.8h, #8
-            umlsl       v8.4s, v13.4h, \zr1
-            umlsl2      v9.4s, v13.8h, \zr1
-            umlal       v8.4s, v15.4h, \zr1
-            umlal2      v9.4s, v15.8h, \zr1
-            rshrn       v11.4h, v8.4s, #8
-            rshrn2      v11.8h, v9.4s, #8
-
-            /* X interpolate, lanes 0 and 1 v10,v11 -> v14 */
-            ushll       v8.4s, v10.4h, #8
-            ushll       v9.4s, v11.4h, #8
-            umlsl       v8.4s, v10.4h, \xr0
-            umlsl       v9.4s, v11.4h, \xr1
-            umlal2      v8.4s, v10.8h, \xr0
-            umlal2      v9.4s, v11.8h, \xr1
-            shrn        v14.4h, v8.4s, #8
-            shrn2       v14.8h, v9.4s, #8
-
-            /* pack lanes 0-1 -> v6 */
-.ifc \dst, v20.16b
-            uqrshrn2    \dst, v14.8h, #8
-.else ; .ifc \dst, v21.16b
-            uqrshrn2    \dst, v14.8h, #8
-.else
-            uqrshrn     \dst, v14.8h, #8
-.endif ; .endif
-.endm
-
-/* void rsdIntrinsic3DLUT_K(
- *          void *dst,          // x0
- *          void const *in,     // x1
- *          size_t count,       // x2
- *          void const *lut,    // x3
- *          int32_t pitchy,     // w4
- *          int32_t pitchz,     // w5
- *          int dimx,           // w6
- *          int dimy,           // w7
- *          int dimz);          // [sp]
- */
-ENTRY(rsdIntrinsic3DLUT_K)
-            ldr         w8, [sp]
-            stp         d8, d9, [sp, #-64]!
-            stp         d10, d11, [sp, #16]
-            stp         d12, d13, [sp, #32]
-            stp         d14, d15, [sp, #48]
-            movi        v4.8b, #1
-            ins         v4.h[0], w6
-            ins         v4.h[1], w7
-            ins         v4.h[2], w8
-            ins         v4.s[2], w4
-            ins         v4.s[3], w5
-            movi        v5.16b, #0
-
-            subs        x2, x2, #8
-            bge         2f
-            cmn         x2, #8    // same as cmp x2, #-8
-            ble         9f
-            b           4f
-
-            .align 6
-1:          st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
-/* x0  = dst
- * x1  = src
- * x2  = count
- * x3  = lut
- * x4  = pitchy
- * x5  = pitchz
- * x6 = offset0
- * x7 = offset1
- */
-2:          ld4         {v0.8b-v3.8b}, [x1], #32
-/* v0,v1,v2,v3 source data
- * v4 dimensions and pitches
- */
-3:          uxtl        v0.8h, v0.8b
-            uxtl        v1.8h, v1.8b
-            uxtl        v2.8h, v2.8b
-            mul         v0.8h, v0.8h, v4.h[0]
-            mul         v1.8h, v1.8h, v4.h[1]
-            mul         v2.8h, v2.8h, v4.h[2]
-
-/* ursra below would be more accurate, but this can result in a dim.0 case
- * where we try to read from the limit of the array and the limit +1 to
- * interpolate, even though the fractional component is zero.  Strictly this is
- * correct, except for the llegal access problem.
- */
-            usra        v0.8h, v0.8h, #8
-            usra        v1.8h, v1.8h, #8
-            usra        v2.8h, v2.8h, #8
-
-            ushr        v12.8h, v0.8h, #8
-            ushr        v13.8h, v1.8h, #8
-            ushr        v14.8h, v2.8h, #8
-            bic         v0.8h, #0xff, LSL #8
-            xtn         v1.8b, v1.8h
-            bic         v2.8h, #0xff, LSL #8
-
-/* v0.8h,v1.8b,v2.hb fractional offset
- * v12.8h,v13.8h,v14.8h integer offset
- */
-
-            ushll       v6.4s, v12.4h, #2
-            ushll2      v7.4s, v12.8h, #2
-            uxtl        v8.4s, v13.4h
-            uxtl2       v9.4s, v13.8h
-            uxtl        v10.4s, v14.4h
-            uxtl2       v11.4s, v14.8h
-            mla         v6.4s, v8.4s,  v4.s[2]
-            mla         v7.4s, v9.4s,  v4.s[2]
-            mla         v6.4s, v10.4s, v4.s[3]
-            mla         v7.4s, v11.4s, v4.s[3]
-
-/* v6,v7 list of table offsets */
-
-        /* lanes 0 and 1 */
-            lanepair    dst=v20.8b,  src0=v6.s[0], src1=v6.s[1], xr0=v0.h[0], xr1=v0.h[1], yr0=v1.b[0], yr1=v1.b[1], zr0=v2.h[0], zr1=v2.h[1]
-
-        /* lanes 2 and 3 */
-            lanepair    dst=v20.16b, src0=v6.s[2], src1=v6.s[3], xr0=v0.h[2], xr1=v0.h[3], yr0=v1.b[2], yr1=v1.b[3], zr0=v2.h[2], zr1=v2.h[3]
-
-        /* lanes 4 and 5 */
-            lanepair    dst=v21.8b,  src0=v7.s[0], src1=v7.s[1], xr0=v0.h[4], xr1=v0.h[5], yr0=v1.b[4], yr1=v1.b[5], zr0=v2.h[4], zr1=v2.h[5]
-
-        /* lanes 6 and 7 */
-            lanepair    dst=v21.16b, src0=v7.s[2], src1=v7.s[3], xr0=v0.h[6], xr1=v0.h[7], yr0=v1.b[6], yr1=v1.b[7], zr0=v2.h[6], zr1=v2.h[7]
-
-            uzp1        v6.16b, v20.16b, v21.16b
-            uzp2        v7.16b, v20.16b, v21.16b
-            uzp1        v20.16b, v6.16b, v7.16b
-            uzp2        v22.16b, v6.16b, v7.16b
-            mov         v21.d[0], v20.d[1]
-
-            subs        x2, x2, #8
-            mov         v23.8b, v3.8b
-
-            bge         1b
-
-            cmn         x2, #8    // same as cmp x2, #-8
-            blt         1f
-
-            st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
-            beq         9f
-
-            /* fill the vector  with a safe value */
-4:          ld4r        {v0.8b-v3.8b}, [x1]
-            tbz         x2, #2, 2f
-            ld4         {v0.b-v3.b}[0], [x1], #4
-            ld4         {v0.b-v3.b}[1], [x1], #4
-            ld4         {v0.b-v3.b}[2], [x1], #4
-            ld4         {v0.b-v3.b}[3], [x1], #4
-2:          tbz         x2, #1, 2f
-            ld4         {v0.b-v3.b}[4], [x1], #4
-            ld4         {v0.b-v3.b}[5], [x1], #4
-2:          tbz         x2, #0, 2f
-            ld4         {v0.b-v3.b}[6], [x1], #4
-2:          b           3b
-
-1:          tst         x2, #4
-            beq         2f
-            st4         {v20.b-v23.b}[0], [x0], #4
-            st4         {v20.b-v23.b}[1], [x0], #4
-            st4         {v20.b-v23.b}[2], [x0], #4
-            st4         {v20.b-v23.b}[3], [x0], #4
-2:          tst         x2, #2
-            beq         2f
-            st4         {v20.b-v23.b}[4], [x0], #4
-            st4         {v20.b-v23.b}[5], [x0], #4
-2:          tst         x2, #1
-            beq         9f
-            st4         {v20.b-v23.b}[6], [x0], #4
-
-9:          ldp         d14, d15, [sp, #48]
-            ldp         d12, d13, [sp, #32]
-            ldp         d10, d11, [sp, #16]
-            ldp         d8, d9, [sp], #64
-            ret
-END(rsdIntrinsic3DLUT_K)

diff --git a/toolkit/Lut3d_neon.S b/toolkit/Lut3d_neon.S
deleted file mode 100644
index 9590f9c..0000000
--- a/toolkit/Lut3d_neon.S
+++ /dev/null

@@ -1,256 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
-#define END(f) .fnend; .size f, .-f;
-
-.eabi_attribute 25,1 @Tag_ABI_align8_preserved
-.arm
-
-.macro lanepair dst, src, xr0, xr1, yr0, yr1, zr0, zr1
-
-            vmov        r6, r7, \src
-
-            add         r6, r6, r3
-            add         r7, r7, r3
-
-            vld1.u8     d16, [r6], r4
-            vld1.u8     d17, [r7], r4
-
-            vld1.u8     d18, [r6], r5
-            vld1.u8     d19, [r7], r5
-
-            vdup.u8     d6, \yr0
-            vdup.u8     d7, \yr1
-            /* Y interpolate, front, lanes 0 and 1 -> q12 and q13 */
-            vshll.u8    q12, d16, #8
-            vshll.u8    q13, d17, #8
-            vmlsl.u8    q12, d16, d6
-            vmlsl.u8    q13, d17, d7
-            vmlal.u8    q12, d18, d6
-            vmlal.u8    q13, d19, d7
-
-            vld1.u8     d18, [r6]
-            vld1.u8     d19, [r7]
-
-            sub         r6, r6, r4
-            sub         r7, r7, r4
-
-            vld1.u8     d16, [r6]
-            vld1.u8     d17, [r7]
-
-            /* Y interpolate, rear, lanes 0 and 1 -> q14 and q15 */
-            vshll.u8    q14, d16, #8
-            vshll.u8    q15, d17, #8
-            vmlsl.u8    q14, d16, d6
-            vmlsl.u8    q15, d17, d7
-            vmlal.u8    q14, d18, d6
-            vmlal.u8    q15, d19, d7
-
-            /* Z interpolate, lane 0 q12/q14 -> q10 */
-            vshll.u16   q8, d24, #8
-            vshll.u16   q9, d25, #8
-            vmlsl.u16   q8, d24, \zr0
-            vmlsl.u16   q9, d25, \zr0
-            vmlal.u16   q8, d28, \zr0
-            vmlal.u16   q9, d29, \zr0
-            vrshrn.u32  d20, q8, #8
-            vrshrn.u32  d21, q9, #8
-
-            /* Z interpolate, lane 1 q13/q15 -> q11 */
-            vshll.u16   q8, d26, #8
-            vshll.u16   q9, d27, #8
-            vmlsl.u16   q8, d26, \zr1
-            vmlsl.u16   q9, d27, \zr1
-            vmlal.u16   q8, d30, \zr1
-            vmlal.u16   q9, d31, \zr1
-            vrshrn.u32  d22, q8, #8
-            vrshrn.u32  d23, q9, #8
-
-            /* X interpolate, lanes 0 and 1 q10,q11 -> q14 */
-            vshll.u16   q8, d20, #8
-            vshll.u16   q9, d22, #8
-            vmlsl.u16   q8, d20, \xr0
-            vmlsl.u16   q9, d22, \xr1
-            vmlal.u16   q8, d21, \xr0
-            vmlal.u16   q9, d23, \xr1
-            vshrn.u32   d28, q8, #8
-            vshrn.u32   d29, q9, #8
-
-            /* pack lanes 0-1 -> d12 */
-            vqrshrn.u16  \dst, q14, #8
-.endm
-
-/* void rsdIntrinsic3DLUT_K(
- *          void *dst,          // r0
- *          void const *in,     // r1
- *          size_t count,       // r2
- *          void const *lut,    // r3
- *          int32_t pitchy,     // [sp]
- *          int32_t pitchz,     // [sp+#4]
- *          int dimx,           // [sp+#8]
- *          int dimy,           // [sp+#12]
- *          int dimz);          // [sp+#16]
- */
-ENTRY(rsdIntrinsic3DLUT_K)
-            push        {r4,r5,r6,r7}
-            ldr         r4, [sp, #16]
-            ldr         r5, [sp, #20]
-            ldr         r6, [sp, #24]
-            ldr         r7, [sp, #28]
-            ldr         r12, [sp, #32]
-            vpush       {d8-d15}
-
-            vmov.u8     d8, #1
-            vmov.u16    d8[0], r6
-            vmov.u16    d8[1], r7
-            vmov.u16    d8[2], r12
-            vmov        d9, r4, r5
-
-            subs        r2, #8
-            bge         2f
-            cmp         r2, #-8
-            ble         9f
-            b           4f
-
-            .align 6
-1:          vst4.u8     {d12,d13,d14,d15}, [r0]!
-/* r0  = dst
- * r1  = src
- * r2  = count
- * r3  = lut
- * r4  = pitchy
- * r5  = pitchz
- * r6 = offset0
- * r7 = offset1
- */
-2:          vld4.u8     {d0,d2,d4,d6}, [r1]!
-3:          vmov        d10, d6
-/* q0,q1,q2,q5 source data
- * q4 dimensions and pitches
- * q3, scratch register for scalar access
- */
-            vmov        q3, q4
-            vmovl.u8    q0, d0
-            vmovl.u8    q1, d2
-            vmovl.u8    q2, d4
-            vmul.u16    q0, q0, d6[0]
-            vmul.u16    q1, q1, d6[1]
-            vmul.u16    q2, q2, d6[2]
-
-/* vrsra.u16 below would be more accurate, but this can result in a dim.0 case
- * where we try to read from the limit of the array and the limit +1 to
- * interpolate, even though the fractional component is zero.  Strictly this is
- * correct, except for the llegal access problem.
- */
-            vsra.u16    q0, q0, #8
-            vsra.u16    q1, q1, #8
-            vsra.u16    q2, q2, #8
-
-            vshr.u16    q12, q0, #8
-            vshr.u16    q13, q1, #8
-            vshr.u16    q14, q2, #8
-
-            vbic.u16    q0, #0xff00
-            vmovn.u16   d2, q1
-            vbic.u16    q2, #0xff00
-
-/* q0,d2,q2 fractional offset
- * q12,q13,q14 integer offset
- */
-
-            vshll.u16   q6, d24, #2
-            vshll.u16   q7, d25, #2
-            vmovl.u16   q8, d26
-            vmovl.u16   q9, d27
-            vmovl.u16   q10, d28
-            vmovl.u16   q11, d29
-            vmla.s32    q6, q8,  d9[0]
-            vmla.s32    q7, q9,  d9[0]
-            vmla.s32    q6, q10, d9[1]
-            vmla.s32    q7, q11, d9[1]
-
-/* q6,q7 list of table offsets */
-
-        /* lanes 0 and 1 */
-            lanepair dst=d12, src=d12, xr0=d0[0], xr1=d0[1], yr0=d2[0], yr1=d2[1], zr0=d4[0], zr1=d4[1]
-
-        /* lanes 2 and 3 */
-            lanepair dst=d13, src=d13, xr0=d0[2], xr1=d0[3], yr0=d2[2], yr1=d2[3], zr0=d4[2], zr1=d4[3]
-
-        /* lanes 4 and 5 */
-            lanepair dst=d14, src=d14, xr0=d1[0], xr1=d1[1], yr0=d2[4], yr1=d2[5], zr0=d5[0], zr1=d5[1]
-
-        /* lanes 6 and 7 */
-            lanepair dst=d15, src=d15, xr0=d1[2], xr1=d1[3], yr0=d2[6], yr1=d2[7], zr0=d5[2], zr1=d5[3]
-
-            vuzp.u8     d12, d13
-            vuzp.u8     d14, d15
-            vuzp.u8     d12, d14
-            vuzp.u8     d13, d15
-
-            subs        r2, r2, #8
-            vmov.u8     d15, d10
-
-            bge         1b
-
-            cmp         r2, #-8
-            blt         1f
-
-            vst4.u8     {d12,d13,d14,d15}, [r0]!
-
-            beq         9f
-
-            /* fill the vector with a safe value */
-4:          vld1.u32    {d0[]}, [r1]
-            vmov        d2, d0
-            vmov        d4, d0
-            vmov        d6, d0
-            tst         r2, #4
-            beq         2f
-            vld1.u32    {d0}, [r1]!
-            vld1.u32    {d2}, [r1]!
-2:          tst         r2, #2
-            beq         2f
-            vld1.u32    {d4}, [r1]!
-2:          tst         r2, #1
-            beq         2f
-            vld1.u32    {d6[0]}, [r1]!
-2:          vuzp.8      d0, d2
-            vuzp.8      d4, d6
-            vuzp.8      d0, d4
-            vuzp.8      d2, d6
-            b           3b
-
-1:          vzip.8      d12, d14
-            vzip.8      d13, d15
-            vzip.8      d12, d13
-            vzip.8      d14, d15
-            tst         r2, #4
-            beq         2f
-            vst1.u32    {d12,d13}, [r0]!
-2:          tst         r2, #2
-            beq         2f
-            vst1.u32    {d14}, [r0]!
-2:          tst         r2, #1
-            beq         9f
-            vst1.u32    {d15[0]}, [r0]!
-
-9:          mov         r0, #0
-            vpop        {d8-d15}
-            pop         {r4,r5,r6,r7}
-            bx lr
-END(rsdIntrinsic3DLUT_K)

diff --git a/toolkit/README.txt b/toolkit/README.txt
deleted file mode 100644
index 4e08dc5..0000000
--- a/toolkit/README.txt
+++ /dev/null

@@ -1,9 +0,0 @@
-This directory will contain the standalone library meant to replace the RenderScript Intrinsics.
-
-The work in this directory is not complete.
-
-To make the review process manageable, a series of smaller CLs will be reviewed and submitted.
-
-While it is initially built with Soong, the end goal is to move this to github once the work
-has been completed. This is a staging area for the reviews.
-

diff --git a/toolkit/RenderScriptToolkit.cpp b/toolkit/RenderScriptToolkit.cpp
deleted file mode 100644
index f110317..0000000
--- a/toolkit/RenderScriptToolkit.cpp
+++ /dev/null

@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "RenderScriptToolkit.h"
-
-#include "TaskProcessor.h"
-
-#define LOG_TAG "renderscript.toolkit.RenderScriptToolkit"
-
-namespace android {
-namespace renderscript {
-
-// You will find the implementation of the various transformations in the correspondingly
-// named source file. E.g. RenderScriptToolkit::blur() is found in Blur.cpp.
-
-RenderScriptToolkit::RenderScriptToolkit(int numberOfThreads)
-    : processor{new TaskProcessor(numberOfThreads)} {}
-
-RenderScriptToolkit::~RenderScriptToolkit() {
-    // By defining the destructor here, we don't need to include TaskProcessor.h
-    // in RenderScriptToolkit.h.
-}
-
-}  // namespace renderscript
-}  // namespace android

diff --git a/toolkit/RenderScriptToolkit.h b/toolkit/RenderScriptToolkit.h
deleted file mode 100644
index fb33195..0000000
--- a/toolkit/RenderScriptToolkit.h
+++ /dev/null

@@ -1,540 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ANDROID_RENDERSCRIPT_TOOLKIT_TOOLKIT_H
-#define ANDROID_RENDERSCRIPT_TOOLKIT_TOOLKIT_H
-
-#include <cstdint>
-#include <memory>
-
-namespace android {
-namespace renderscript {
-
-class TaskProcessor;
-
-/**
- * Define a range of data to process.
- *
- * This class is used to restrict a Toolkit operation to a rectangular subset of the input
- * tensor.
- *
- * @property startX The index of the first value to be included on the X axis.
- * @property endX The index after the last value to be included on the X axis.
- * @property startY The index of the first value to be included on the Y axis.
- * @property endY The index after the last value to be included on the Y axis.
- */
-struct Restriction {
-    size_t startX;
-    size_t endX;
-    size_t startY;
-    size_t endY;
-};
-
-/**
- * A collection of high-performance graphic utility functions like blur and blend.
- *
- * This toolkit provides ten image manipulation functions: blend, blur, color matrix, convolve,
- * histogram, histogramDot, lut, lut3d, resize, and YUV to RGB. These functions execute
- * multithreaded on the CPU.
- *
- * These functions work over raw byte arrays. You'll need to specify the width and height of
- * the data to be processed, as well as the number of bytes per pixel. For most use cases,
- * this will be 4.
- *
- * You should instantiate the Toolkit once and reuse it throughout your application.
- * On instantiation, the Toolkit creates a thread pool that's used for processing all the functions.
- * You can limit the number of pool threads used by the Toolkit via the constructor. The pool
- * threads are destroyed once the Toolkit is destroyed, after any pending work is done.
- *
- * This library is thread safe. You can call methods from different pool threads. The functions will
- * execute sequentially.
- *
- * A Java/Kotlin Toolkit is available. It calls this library through JNI.
- *
- * This toolkit can be used as a replacement for most RenderScript Intrinsic functions. Compared
- * to RenderScript, it's simpler to use and more than twice as fast on the CPU. However RenderScript
- * Intrinsics allow more flexibility for the type of allocation supported. In particular, this
- * toolkit does not support allocations of floats.
- */
-class RenderScriptToolkit {
-    /** Each Toolkit method call is converted to a Task. The processor owns the thread pool. It
-     * tiles the tasks and schedule them over the pool threads.
-     */
-    std::unique_ptr<TaskProcessor> processor;
-
-   public:
-    /**
-     * Creates the pool threads that are used for processing the method calls.
-     */
-    RenderScriptToolkit(int numberOfThreads = 0);
-    /**
-     * Destroys the thread pool. This stops any in-progress work; the Toolkit methods called from
-     * other pool threads will return without having completed the work. Because of the undefined
-     * state of the output buffers, an application should avoid destroying the Toolkit if other pool
-     * threads are executing Toolkit methods.
-     */
-    ~RenderScriptToolkit();
-
-    /**
-     * Determines how a source buffer is blended into a destination buffer.
-     *
-     * See {@link RenderScriptToolkit::blend}.
-     *
-     * blend only works on 4 byte RGBA data. In the descriptions below, ".a" represents
-     * the alpha channel.
-     */
-    enum class BlendingMode {
-        /**
-         * dest = 0
-         *
-         * The destination is cleared, i.e. each pixel is set to (0, 0, 0, 0)
-         */
-        CLEAR = 0,
-        /**
-         * dest = src
-         *
-         * Sets each pixel of the destination to the corresponding one in the source.
-         */
-        SRC = 1,
-        /**
-         * dest = dest
-         *
-         * Leaves the destination untouched. This is a no-op.
-         */
-        DST = 2,
-        /**
-         * dest = src + dest * (1.0 - src.a)
-         */
-        SRC_OVER = 3,
-        /**
-         * dest = dest + src * (1.0 - dest.a)
-         */
-        DST_OVER = 4,
-        /**
-         * dest = src * dest.a
-         */
-        SRC_IN = 5,
-        /**
-         * dest = dest * src.a
-         */
-        DST_IN = 6,
-        /**
-         * dest = src * (1.0 - dest.a)
-         */
-        SRC_OUT = 7,
-        /**
-         * dest = dest * (1.0 - src.a)
-         */
-        DST_OUT = 8,
-        /**
-         * dest.rgb = src.rgb * dest.a + (1.0 - src.a) * dest.rgb, dest.a = dest.a
-         */
-        SRC_ATOP = 9,
-        /**
-         * dest = dest.rgb * src.a + (1.0 - dest.a) * src.rgb, dest.a = src.a
-         */
-        DST_ATOP = 10,
-        /**
-         * dest = {src.r ^ dest.r, src.g ^ dest.g, src.b ^ dest.b, src.a ^ dest.a}
-         *
-         * Note: this is NOT the Porter/Duff XOR mode; this is a bitwise xor.
-         */
-        XOR = 11,
-        /**
-         * dest = src * dest
-         */
-        MULTIPLY = 12,
-        /**
-         * dest = min(src + dest, 1.0)
-         */
-        ADD = 13,
-        /**
-         * dest = max(dest - src, 0.0)
-         */
-        SUBTRACT = 14
-    };
-
-    /**
-     * Blend a source buffer with the destination buffer.
-     *
-     * Blends a source buffer and a destination buffer, placing the result in the destination
-     * buffer. The blending is done pairwise between two corresponding RGBA values found in
-     * each buffer. The mode parameter specifies one of fifteen blending operations.
-     * See {@link BlendingMode}.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of each buffer. If provided, the range must be wholly contained with the dimensions
-     * described by sizeX and sizeY.
-     *
-     * The source and destination buffers must have the same dimensions. Both buffers should be
-     * large enough for sizeX * sizeY * 4 bytes. The buffers have a row-major layout.
-     *
-     * @param mode The specific blending operation to do.
-     * @param source The RGBA input buffer.
-     * @param dest The destination buffer. Used for input and output.
-     * @param sizeX The width of both buffers, as a number of RGBA values.
-     * @param sizeY The height of both buffers, as a number of RGBA values.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     */
-    void blend(BlendingMode mode, const uint8_t* _Nonnull source, uint8_t* _Nonnull dst,
-               size_t sizeX, size_t sizeY, const Restriction* _Nullable restriction = nullptr);
-
-    /**
-     * Blur an image.
-     *
-     * Performs a Gaussian blur of the input image and stores the result in the out buffer.
-     *
-     * The radius determines which pixels are used to compute each blurred pixels. This Toolkit
-     * accepts values between 1 and 25. Larger values create a more blurred effect but also
-     * take longer to compute. When the radius extends past the edge, the edge pixel will
-     * be used as replacement for the pixel that's out off boundary.
-     *
-     * Each input pixel can either be represented by four bytes (RGBA format) or one byte
-     * for the less common blurring of alpha channel only image.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of each buffer. If provided, the range must be wholly contained with the dimensions
-     * described by sizeX and sizeY.
-     *
-     * The input and output buffers must have the same dimensions. Both buffers should be
-     * large enough for sizeX * sizeY * vectorSize bytes. The buffers have a row-major layout.
-     *
-     * @param in The buffer of the image to be blurred.
-     * @param out The buffer that receives the blurred image.
-     * @param sizeX The width of both buffers, as a number of 1 or 4 byte cells.
-     * @param sizeY The height of both buffers, as a number of 1 or 4 byte cells.
-     * @param vectorSize Either 1 or 4, the number of bytes in each cell, i.e. A vs. RGBA.
-     * @param radius The radius of the pixels used to blur.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     */
-    void blur(const uint8_t* _Nonnull in, uint8_t* _Nonnull out, size_t sizeX, size_t sizeY,
-              size_t vectorSize, int radius, const Restriction* _Nullable restriction = nullptr);
-
-    /**
-     * Identity matrix that can be passed to the {@link RenderScriptToolkit::colorMatrix} method.
-     *
-     * Using this matrix will result in no change to the pixel through multiplication although
-     * the pixel value can still be modified by the add vector, or transformed to a different
-     * format.
-     */
-    static constexpr float kIdentityMatrix[] =  {
-            1.0f, 0.0f, 0.0f, 0.0f,
-            0.0f, 1.0f, 0.0f, 0.0f,
-            0.0f, 0.0f, 1.0f, 0.0f,
-            0.0f, 0.0f, 0.0f, 1.0f
-    };
-
-    /**
-     * Matrix to turn color pixels to a grey scale.
-     *
-     * Use this matrix with the {@link RenderScriptToolkit::colorMatrix} method to convert an
-     * image from color to greyscale.
-     */
-    static constexpr float kGreyScaleColorMatrix[] = {
-            0.299f, 0.299f, 0.299f, 0.0f,
-            0.587f, 0.587f, 0.587f, 0.0f,
-            0.114f, 0.114f, 0.114f, 0.0f,
-            0.0f,   0.0f,   0.0f,   1.0f
-    };
-
-    /**
-     * Matrix to convert RGB to YUV.
-     *
-     * Use this matrix with the {@link RenderScriptToolkit::colorMatrix} method to convert the
-     * first three bytes of each pixel from RGB to YUV. This leaves the last byte (the alpha
-     * channel) untouched.
-     *
-     * This is a simplistic conversion. Most YUV buffers have more complicated format, not supported
-     * by this method.
-     */
-    static constexpr float kRgbToYuvMatrix[] = {
-            0.299f, -0.14713f,  0.615f,   0.0f,
-            0.587f, -0.28886f, -0.51499f, 0.0f,
-            0.114f,  0.436f,   -0.10001f, 0.0f,
-            0.0f,    0.0f,      0.0f,     1.0f
-    };
-
-    /**
-     * Matrix to convert YUV to RGB.
-     *
-     * Use this matrix with the {@link RenderScriptToolkit::colorMatrix} method to convert the
-     * first three bytes of each pixel from YUV to RGB. This leaves the last byte (the alpha
-     * channel) untouched.
-     *
-     * This is a simplistic conversion. Most YUV buffers have more complicated format, not supported
-     * by this method. Use {@link RenderScriptToolkit::yuvToRgb} to convert these buffers.
-     */
-    static constexpr float kYuvToRgbMatrix[] = {
-            1.0f,      1.0f,     1.0f,     0.0f,
-            0.0f,     -0.39465f, 2.03211f, 0.0f,
-            1.13983f, -0.5806f,  0.0f,     0.0f,
-            0.0f,      0.0f,     0.0f,     1.0f
-    };
-
-    /**
-     * Transform an image using a color matrix.
-     *
-     * Converts a 2D array of vectors of unsigned bytes, multiplying each vector by a 4x4 matrix
-     * and adding an optional vector.
-     *
-     * Each input vector is composed of 1-4 unsigned bytes. If it has fewer than 4 bytes, it is
-     * extended to 4 by padding with zeroes. The unsigned bytes are converted from 0-255 to
-     * 0.0-1.0 floats before the multiplication is done.
-     *
-     * The resulting value is converted back from 0.0-1.0 to a 0-255 value and stored in the
-     * output. If the output vector size is less than four, the unused channels are discarded.
-     *
-     * If addVector is null, a vector of zeroes is added, i.e. a noop.
-     *
-     * Check kIdentityMatrix, kGreyScaleColorMatrix, kRgbToYuvMatrix, and kYuvToRgbMatrix for sample
-     * matrices. The YUV conversion may not work for all color spaces.
-     *
-     * @param in The buffer of the image to be converted.
-     * @param out The buffer that receives the converted image.
-     * @param inputVectorSize The number of bytes in each input cell, a value from 1 to 4.
-     * @param outputVectorSize The number of bytes in each output cell, a value from 1 to 4.
-     * @param sizeX The width of both buffers, as a number of 1 to 4 byte cells.
-     * @param sizeY The height of both buffers, as a number of 1 to 4 byte cells.
-     * @param matrix The 4x4 matrix to multiply, in row major format.
-     * @param addVector A vector of four floats that's added to the result of the multiplication.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     */
-    void colorMatrix(const void* _Nonnull in, void* _Nonnull out, size_t inputVectorSize,
-                     size_t outputVectorSize, size_t sizeX, size_t sizeY,
-                     const float* _Nonnull matrix, const float* _Nullable addVector = nullptr,
-                     const Restriction* _Nullable restriction = nullptr);
-
-    /**
-     * Convolve a ByteArray.
-     *
-     * Applies a 3x3 (convolve3x3) or 5x5 (convolve5x5) convolution to the input array using the
-     * provided coefficients.
-     *
-     * For 3x3 convolutions, 9 coefficients must be provided. For 5x5, 25 coefficients are needed.
-     * The coefficients should be provided in row-major format.
-     *
-     * When the square extends past the edge, the edge values will be used as replacements for the
-     * values that are out of bounds.
-     *
-     * Each input cell can be represented by one to four bytes. Each byte is multiplied
-     * and accumulated independently of the other bytes of the cell.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of each buffer. If provided, the range must be wholly contained within the dimensions
-     * described by sizeX and sizeY.
-     *
-     * The input and output buffers must have the same dimensions. Both buffers should be
-     * large enough for sizeX * sizeY * vectorSize bytes. The buffers have a row-major layout.
-     *
-     * @param in The buffer of the image to be convolved.
-     * @param out The buffer that receives the convolved image.
-     * @param vectorSize The number of bytes in each cell, a value from 1 to 4.
-     * @param sizeX The width of both buffers, as a number of 1 to 4 byte cells.
-     * @param sizeY The height of both buffers, as a number of 1 to 4 byte cells.
-     * @param coefficients 9 (convolve3x3) or 25 (convolve5x5) multipliers.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     */
-    void convolve3x3(const void* _Nonnull in, void* _Nonnull out, size_t vectorSize, size_t sizeX,
-                     size_t sizeY, const float* _Nonnull coefficients,
-                     const Restriction* _Nullable restriction = nullptr);
-
-    void convolve5x5(const void* _Nonnull in, void* _Nonnull out, size_t vectorSize, size_t sizeX,
-                     size_t sizeY, const float* _Nonnull coefficients,
-                     const Restriction* _Nullable restriction = nullptr);
-
-    /**
-     * Compute the histogram of an image.
-     *
-     * Tallies how many times each of the 256 possible values of a byte is found in the input.
-     *
-     * An input cell can be represented by one to four bytes. The tally is done independently
-     * for each of the bytes of the cell. Correspondingly, the out array will have
-     * 256 * vectorSize entries. The counts for value 0 are consecutive, followed by those for
-     * value 1, etc.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of the input buffer. If provided, the range must be wholly contained within the dimensions
-     * described by sizeX and sizeY.
-     *
-     * The input buffer should be large enough for sizeX * sizeY * vectorSize bytes and has a
-     * row-major layout. The out buffer should be large enough for 256 * vectorSize ints.
-     *
-     * @param in The buffer of the image to be analyzed.
-     * @param out The resulting vector of counts.
-     * @param sizeX The width of the input buffer, as a number of 1 to 4 byte cells.
-     * @param sizeY The height of the input buffer, as a number of 1 to 4 byte cells.
-     * @param vectorSize The number of bytes in each cell, a value from 1 to 4.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     */
-    void histogram(const uint8_t* _Nonnull in, int32_t* _Nonnull out, size_t sizeX, size_t sizeY,
-                   size_t vectorSize, const Restriction* _Nullable restriction = nullptr);
-
-    /**
-     * Compute the histogram of the dot product of an image.
-     *
-     * This method supports cells of 1 to 4 bytes in length. For each cell of the array,
-     * the dot product of its bytes with the provided coefficients is computed. The resulting
-     * floating point value is converted to an unsigned byte and tallied in the histogram.
-     *
-     * If coefficients is null, the coefficients used for RGBA luminosity calculation will be used,
-     * i.e. the values [0.299f, 0.587f, 0.114f, 0.f].
-     *
-     * Each coefficient must be >= 0 and their sum must be 1.0 or less. There must be the same
-     * number of coefficients as vectorSize.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of the input buffer. If provided, the range must be wholly contained within the dimensions
-     * described by sizeX and sizeY.
-     *
-     * The input buffer should be large enough for sizeX * sizeY * vectorSize bytes and has a
-     * row-major layout. The out array should be large enough for 256 ints.
-     *
-     * @param in The buffer of the image to be analyzed.
-     * @param out The resulting vector of counts.
-     * @param sizeX The width of the input buffer, as a number of 1 to 4 byte cells.
-     * @param sizeY The height of the input buffer, as a number of 1 to 4 byte cells.
-     * @param vectorSize The number of bytes in each cell, a value from 1 to 4.
-     * @param coefficients The values used for the dot product. Can be nullptr.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     */
-    void histogramDot(const uint8_t* _Nonnull in, int32_t* _Nonnull out, size_t sizeX, size_t sizeY,
-                      size_t vectorSize, const float* _Nullable coefficients,
-                      const Restriction* _Nullable restriction = nullptr);
-
-    /**
-     * Transform an image using a look up table.
-     *
-     * Transforms an image by using a per-channel lookup table. Each channel of the input has an
-     * independent lookup table. The tables are 256 entries in size and can cover the full value
-     * range of a byte.
-     *
-     * The input array should be in RGBA format, where four consecutive bytes form a cell.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of each buffer. If provided, the range must be wholly contained within the dimensions
-     * described by sizeX and sizeY.
-     *
-     * The input and output buffers must have the same dimensions. Both buffers should be
-     * large enough for sizeX * sizeY cells of 4 bytes. The buffers have a row-major layout.
-     *
-     * @param in The buffer of the image to be transformed.
-     * @param out The buffer that receives the transformed image.
-     * @param sizeX The width of both buffers, as a number of 4 byte cells.
-     * @param sizeY The height of both buffers, as a number of 4 byte cells.
-     * @param red An array of 256 values that's used to convert the R channel.
-     * @param green An array of 256 values that's used to convert the G channel.
-     * @param blue An array of 256 values that's used to convert the B channel.
-     * @param alpha An array of 256 values that's used to convert the A channel.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     */
-    void lut(const uint8_t* _Nonnull in, uint8_t* _Nonnull out, size_t sizeX, size_t sizeY,
-             const uint8_t* _Nonnull red, const uint8_t* _Nonnull green,
-             const uint8_t* _Nonnull blue, const uint8_t* _Nonnull alpha,
-             const Restriction* _Nullable restriction = nullptr);
-
-    /**
-     * Transform an image using a 3D look up table.
-     *
-     * Transforms an image, converting RGB to RGBA by using a 3D lookup table. The incoming R, G,
-     * and B values are normalized to the dimensions of the provided 3D buffer. The eight nearest
-     * values in that 3D buffer are sampled and linearly interpolated. The resulting RGBA entry
-     * is stored in the output.
-     *
-     * The input array should be in RGBA format, where four consecutive bytes form a cell.
-     * The fourth byte of each input cell is ignored.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of each buffer. If provided, the range must be wholly contained within the dimensions
-     * described by sizeX and sizeY.
-     *
-     * The input and output buffers must have the same dimensions. Both buffers should be
-     * large enough for sizeX * sizeY cells of 4 bytes. The buffers have a row-major layout.
-     *
-     * @param in The buffer of the image to be transformed.
-     * @param out The buffer that receives the transformed image.
-     * @param sizeX The width of both buffers, as a number of 4 byte cells.
-     * @param sizeY The height of both buffers, as a number of 4 byte cells.
-     * @param cube The translation cube, in row-major format.
-     * @param cubeSizeX The number of RGBA entries in the cube in the X direction.
-     * @param cubeSizeY The number of RGBA entries in the cube in the Y direction.
-     * @param cubeSizeZ The number of RGBA entries in the cube in the Z direction.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     */
-    void lut3d(const uint8_t* _Nonnull in, uint8_t* _Nonnull out, size_t sizeX, size_t sizeY,
-               const uint8_t* _Nonnull cube, size_t cubeSizeX, size_t cubeSizeY, size_t cubeSizeZ,
-               const Restriction* _Nullable restriction = nullptr);
-
-    /**
-     * Resize an image.
-     *
-     * Resizes an image using bicubic interpolation.
-     *
-     * This method supports cells of 1 to 4 bytes in length. Each byte of the cell is
-     * interpolated independently from the others.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of the output buffer. The corresponding scaled range of the input will be used. If provided,
-     * the range must be wholly contained within the dimensions described by outputSizeX and
-     * outputSizeY.
-     *
-     * The input and output buffers have a row-major layout. The input buffer should be large
-     * enough for inputSizeX * inputSizeY cells and the output buffer for
-     * outputSizeX * outputSizeY cells.
-     *
-     * @param in The buffer of the image to be resized.
-     * @param out The buffer that receives the resized image.
-     * @param inputSizeX The width of the input buffer, as a number of 1-4 byte cells.
-     * @param inputSizeY The height of the input buffer, as a number of 1-4 byte cells.
-     * @param vectorSize The number of bytes in each cell of both buffers. A value from 1 to 4.
-     * @param outputSizeX The width of the output buffer, as a number of 1-4 byte cells.
-     * @param outputSizeY The height of the output buffer, as a number of 1-4 byte cells.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     */
-    void resize(const uint8_t* _Nonnull in, uint8_t* _Nonnull out, size_t inputSizeX,
-                size_t inputSizeY, size_t vectorSize, size_t outputSizeX, size_t outputSizeY,
-                const Restriction* _Nullable restriction = nullptr);
-
-    /**
-     * The YUV formats supported by yuvToRgb.
-     *
-     * The YV12 value, 0x32315659, is the ASCII characters 'Y', 'V', '1', '2' packed in
-     * little-endian order. NOTE(review): the NV21 value 0x11 presumably mirrors the constant
-     * used by the Android framework for that format; confirm before relying on it.
-     */
-    enum class YuvFormat {
-        NV21 = 0x11,
-        YV12 = 0x32315659,
-    };
-
-    /**
-     * Convert an image from YUV to RGB.
-     *
-     * Converts an Android YUV buffer to RGB. The input buffer should be
-     * supplied in one of the formats listed in YuvFormat.
-     * The output is RGBA; the alpha channel will be set to 255.
-     *
-     * Note that for YV12 and a sizeX that's not a multiple of 32, the
-     * RenderScript Intrinsic may not have converted the image correctly.
-     * This Toolkit method should.
-     *
-     * @param in The buffer of the image to be converted.
-     * @param out The buffer that receives the converted image.
-     * @param sizeX The width in pixels of the image. Must be even.
-     * @param sizeY The height in pixels of the image.
-     * @param format Either YV12 or NV21.
-     */
-    void yuvToRgb(const uint8_t* _Nonnull in, uint8_t* _Nonnull out, size_t sizeX, size_t sizeY,
-                  YuvFormat format);
-};
-
-}  // namespace renderscript
-}  // namespace android
-
-#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_TOOLKIT_H

diff --git a/toolkit/Resize.cpp b/toolkit/Resize.cpp
deleted file mode 100644
index 624ae8e..0000000
--- a/toolkit/Resize.cpp
+++ /dev/null

@@ -1,769 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <math.h>
-
-#include <cstdint>
-
-#include "RenderScriptToolkit.h"
-#include "TaskProcessor.h"
-#include "Utils.h"
-
-#if defined(ARCH_X86_HAVE_AVX2)
-#include <stdint.h>
-#include <x86intrin.h>
-#include <xmmintrin.h>
-#endif
-
-#define LOG_TAG "renderscript.toolkit.Resize"
-
-namespace android {
-namespace renderscript {
-
-/**
- * Task that produces a resized copy of an image.
- *
- * The output dimensions, cell size and optional restriction are handed to the Task base class;
- * this class keeps the input description and the input/output size ratios used by the kernels.
- */
-class ResizeTask : public Task {
-    // Source image and destination image.
-    const uchar* mIn;
-    uchar* mOut;
-    // Input size divided by output size, per axis.
-    float mScaleX;
-    float mScaleY;
-    // Dimensions of the source image, in cells.
-    size_t mInputSizeX;
-    size_t mInputSizeY;
-
-    // Per-row kernels for 1, 2 and 4 byte cells. Each one writes output columns
-    // [xstart, xend) of row currentY starting at outPtr.
-    void kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
-    void kernelU2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
-    void kernelU4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-    void kernelF1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
-    void kernelF2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
-    void kernelF4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
-#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-
-    // Handles one 2D tile of the output. threadIndex tells which thread runs the tile.
-    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
-                             size_t endY) override;
-
-   public:
-    ResizeTask(const uchar* input, uchar* output, size_t inputSizeX, size_t inputSizeY,
-               size_t vectorSize, size_t outputSizeX, size_t outputSizeY,
-               const Restriction* restriction)
-        : Task{outputSizeX, outputSizeY, vectorSize, false, restriction},
-          mIn{input},
-          mOut{output},
-          mScaleX{static_cast<float>(inputSizeX) / outputSizeX},
-          mScaleY{static_cast<float>(inputSizeY) / outputSizeY},
-          mInputSizeX{inputSizeX},
-          mInputSizeY{inputSizeY} {}
-};
-
-/**
- * Resizes one tile of the output image.
- *
- * Picks the kernel that matches the cell size and runs it once for every output row of the
- * tile. Cells of 3 bytes are handled by the 4 byte kernel; the output offset is computed with
- * paddedSize(mVectorSize) (presumably 4 for 3 byte cells; confirm against Utils.h).
- *
- * @param startX First output column of the tile (inclusive).
- * @param startY First output row of the tile (inclusive).
- * @param endX   End output column of the tile (exclusive).
- * @param endY   End output row of the tile (exclusive).
- */
-void ResizeTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
-                             size_t endY) {
-    using KernelFunction = void (ResizeTask::*)(uchar*, uint32_t, uint32_t, uint32_t);
-
-    KernelFunction kernel = nullptr;
-    switch (mVectorSize) {
-        case 4:
-        case 3:
-            kernel = &ResizeTask::kernelU4;
-            break;
-        case 2:
-            kernel = &ResizeTask::kernelU2;
-            break;
-        case 1:
-            kernel = &ResizeTask::kernelU1;
-            break;
-        default:
-            // No kernel exists for this cell size. Stop here: invoking an unset
-            // member-function pointer in the loop below would be undefined behaviour.
-            ALOGE("Bad vector size %zd", mVectorSize);
-            return;
-    }
-
-    for (size_t y = startY; y < endY; y++) {
-        size_t offset = (mSizeX * y + startX) * paddedSize(mVectorSize);
-        uchar* out = mOut + offset;
-        std::invoke(kernel, this, out, startX, endX, y);
-    }
-}
-
-// Cubic interpolation between p1 and p2 at fraction x, using p0 and p3 as outer neighbours.
-// Evaluated from the innermost term outwards, for all four lanes at once.
-static float4 cubicInterpolate(float4 p0, float4 p1, float4 p2, float4 p3, float x) {
-    const float4 cubicTerm = 3.f * (p1 - p2) + p3 - p0;
-    const float4 quadraticTerm = 2.f * p0 - 5.f * p1 + 4.f * p2 - p3 + x * cubicTerm;
-    const float4 linearTerm = p2 - p0 + x * quadraticTerm;
-    return p1 + 0.5f * x * linearTerm;
-}
-
-// Two-lane variant of the cubic interpolation between p1 and p2 at fraction x.
-static float2 cubicInterpolate(float2 p0, float2 p1, float2 p2, float2 p3, float x) {
-    const float2 cubicTerm = 3.f * (p1 - p2) + p3 - p0;
-    const float2 quadraticTerm = 2.f * p0 - 5.f * p1 + 4.f * p2 - p3 + x * cubicTerm;
-    const float2 linearTerm = p2 - p0 + x * quadraticTerm;
-    return p1 + 0.5f * x * linearTerm;
-}
-
-
-#if defined(ARCH_X86_HAVE_AVX2)  // Scalar cubic interpolation; this branch fuses two terms with FMA intrinsics.
-static float cubicInterpolate(float p0,float p1,float p2,float p3 , float x) {
-   return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 +
-           _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(4.f), _mm_set1_ps(p2),_mm_set1_ps(p3)))  // 4 * p2 - p3
-           + x * (_mm_cvtss_f32(_mm_fmadd_ss (_mm_set1_ps(3.f),_mm_set1_ps(p1 - p2),  // 3 * (p1 - p2) + (p3 - p0)
-                                              _mm_set1_ps(p3 - p0))))));
-
-}
-#else  // Portable version of the same polynomial; grouping differs, so low-order bits may differ.
-static float cubicInterpolate(float p0,float p1,float p2,float p3 , float x) {
-    //ALOGI("CP, %f, %f, %f, %f, %f", p0, p1, p2, p3, x);
-    return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
-            + x * (3.f * (p1 - p2) + p3 - p0)));
-}
-#endif
-
-/**
- * Computes one bicubically interpolated uchar4 cell.
- *
- * @param yp0..yp3 The four source rows to sample.
- * @param xf    Horizontal sample position in source cells.
- * @param yf    Vertical fraction used to blend the four rows.
- * @param width Number of cells in each source row; column indices are clamped to [0, width - 1].
- * @return The interpolated cell, rounded and clamped to 0-255.
- */
-static uchar4 OneBiCubic(const uchar4 *yp0, const uchar4 *yp1, const uchar4 *yp2, const uchar4 *yp3,
-                         float xf, float yf, int width) {
-    const int firstColumn = static_cast<int>(floor(xf - 1));
-    const float fraction = xf - floor(xf);
-    const int lastColumn = width - 1;
-
-    // The two left taps are clamped at the left edge, the two right taps at the right edge.
-    const int c0 = std::max(0, firstColumn + 0);
-    const int c1 = std::max(0, firstColumn + 1);
-    const int c2 = std::min(lastColumn, firstColumn + 2);
-    const int c3 = std::min(lastColumn, firstColumn + 3);
-
-    // Horizontal pass over a single row.
-    auto interpolateRow = [&](const uchar4 *row) -> float4 {
-        return cubicInterpolate(convert<float4>(row[c0]), convert<float4>(row[c1]),
-                                convert<float4>(row[c2]), convert<float4>(row[c3]), fraction);
-    };
-
-    const float4 r0 = interpolateRow(yp0);
-    const float4 r1 = interpolateRow(yp1);
-    const float4 r2 = interpolateRow(yp2);
-    const float4 r3 = interpolateRow(yp3);
-
-    // Vertical pass, then round to nearest and saturate.
-    float4 blended = cubicInterpolate(r0, r1, r2, r3, yf);
-    blended = clamp(blended + 0.5f, 0.f, 255.f);
-    return convert<uchar4>(blended);
-}
-
-/**
- * Computes one bicubically interpolated uchar2 cell.
- *
- * @param yp0..yp3 The four source rows to sample.
- * @param xf    Horizontal sample position in source cells.
- * @param yf    Vertical fraction used to blend the four rows.
- * @param width Number of cells in each source row; column indices are clamped to [0, width - 1].
- * @return The interpolated cell, rounded and clamped to 0-255.
- */
-static uchar2 OneBiCubic(const uchar2 *yp0, const uchar2 *yp1, const uchar2 *yp2, const uchar2 *yp3,
-                         float xf, float yf, int width) {
-    const int firstColumn = static_cast<int>(floor(xf - 1));
-    const float fraction = xf - floor(xf);
-    const int lastColumn = width - 1;
-
-    // The two left taps are clamped at the left edge, the two right taps at the right edge.
-    const int c0 = std::max(0, firstColumn + 0);
-    const int c1 = std::max(0, firstColumn + 1);
-    const int c2 = std::min(lastColumn, firstColumn + 2);
-    const int c3 = std::min(lastColumn, firstColumn + 3);
-
-    // Horizontal pass over a single row.
-    auto interpolateRow = [&](const uchar2 *row) -> float2 {
-        return cubicInterpolate(convert<float2>(row[c0]), convert<float2>(row[c1]),
-                                convert<float2>(row[c2]), convert<float2>(row[c3]), fraction);
-    };
-
-    const float2 r0 = interpolateRow(yp0);
-    const float2 r1 = interpolateRow(yp1);
-    const float2 r2 = interpolateRow(yp2);
-    const float2 r3 = interpolateRow(yp3);
-
-    // Vertical pass, then round to nearest and saturate.
-    float2 blended = cubicInterpolate(r0, r1, r2, r3, yf);
-    blended = clamp(blended + 0.5f, 0.f, 255.f);
-    return convert<uchar2>(blended);
-}
-
-/**
- * Computes one bicubically interpolated single-byte cell.
- *
- * @param yp0..yp3 The four source rows to sample.
- * @param xf    Horizontal sample position in source cells.
- * @param yf    Vertical fraction used to blend the four rows.
- * @param width Number of cells in each source row; column indices are clamped to [0, width - 1].
- * @return The interpolated value, rounded and clamped to 0-255.
- */
-static uchar OneBiCubic(const uchar *yp0, const uchar *yp1, const uchar *yp2, const uchar *yp3,
-                        float xf, float yf, int width) {
-    const int firstColumn = static_cast<int>(floor(xf - 1));
-    const float fraction = xf - floor(xf);
-    const int lastColumn = width - 1;
-
-    // The two left taps are clamped at the left edge, the two right taps at the right edge.
-    const int c0 = std::max(0, firstColumn + 0);
-    const int c1 = std::max(0, firstColumn + 1);
-    const int c2 = std::min(lastColumn, firstColumn + 2);
-    const int c3 = std::min(lastColumn, firstColumn + 3);
-
-    // Horizontal pass over a single row.
-    auto interpolateRow = [&](const uchar *row) -> float {
-        return cubicInterpolate(static_cast<float>(row[c0]), static_cast<float>(row[c1]),
-                                static_cast<float>(row[c2]), static_cast<float>(row[c3]),
-                                fraction);
-    };
-
-    const float r0 = interpolateRow(yp0);
-    const float r1 = interpolateRow(yp1);
-    const float r2 = interpolateRow(yp2);
-    const float r3 = interpolateRow(yp3);
-
-    // Vertical pass, then round to nearest and saturate.
-    float blended = cubicInterpolate(r0, r1, r2, r3, yf);
-    blended = clamp(blended + 0.5f, 0.f, 255.f);
-    return static_cast<uchar>(blended);
-}
-
-extern "C" uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc);  // Callers pass the 16.16 x step; the result is forwarded as osc_ctl.
-
-extern "C" void rsdIntrinsicResizeB4_K(  // Row kernel for uchar4 cells; only called under ARCH_ARM_USE_INTRINSICS.
-            uchar4 *dst,  // first output cell of the row segment
-            size_t count,  // kernelU4 passes xend - xstart
-            uint32_t xf,  // kernelU4 passes the low 16 bits of the fixed-point start position
-            uint32_t xinc,  // kernelU4 passes the x scale in 16.16 fixed point
-            uchar4 const *srcn,  // srcn, src0, src1, src2: the four source rows, offset by the clipped x start
-            uchar4 const *src0,
-            uchar4 const *src1,
-            uchar4 const *src2,
-            size_t xclip,  // kernelU4 passes how far the start offset was moved to stay >= 0
-            size_t avail,  // kernelU4 passes srcWidth - xoff + xclip
-            uint64_t osc_ctl,  // value returned by rsdIntrinsicResize_oscctl_K
-            int32_t const *yr);  // four fixed-point row weights from mkYCoeff
-
-extern "C" void rsdIntrinsicResizeB2_K(  // Row kernel for uchar2 cells; same parameter layout as the B4 variant.
-            uchar2 *dst,
-            size_t count,
-            uint32_t xf,
-            uint32_t xinc,
-            uchar2 const *srcn,
-            uchar2 const *src0,
-            uchar2 const *src1,
-            uchar2 const *src2,
-            size_t xclip,
-            size_t avail,
-            uint64_t osc_ctl,
-            int32_t const *yr);
-
-extern "C" void rsdIntrinsicResizeB1_K(  // Row kernel for single-byte cells; same parameter layout as the B4 variant.
-            uchar *dst,
-            size_t count,
-            uint32_t xf,
-            uint32_t xinc,
-            uchar const *srcn,
-            uchar const *src0,
-            uchar const *src1,
-            uchar const *src2,
-            size_t xclip,
-            size_t avail,
-            uint64_t osc_ctl,
-            int32_t const *yr);
-
-#if defined(ARCH_ARM_USE_INTRINSICS)
-static void mkYCoeff(int32_t *yr, float yf) {  // Writes four 16.16 fixed-point row weights for fraction yf into yr[0..3].
-    int32_t yf1 = rint(yf * 0x10000);  // yf scaled by 2^16
-    int32_t yf2 = rint(yf * yf * 0x10000);  // yf^2 scaled by 2^16
-    int32_t yf3 = rint(yf * yf * yf * 0x10000);  // yf^3 scaled by 2^16
-
-    yr[0] = -(2 * yf2 - yf3 - yf1) >> 1;  // (yf - 2yf^2 + yf^3) / 2: the p0 weight of cubicInterpolate, negated
-    yr[1] = (3 * yf3 - 5 * yf2 + 0x20000) >> 1;  // (2 - 5yf^2 + 3yf^3) / 2: the p1 weight; 0x20000 is 2.0
-    yr[2] = (-3 * yf3 + 4 * yf2 + yf1) >> 1;  // (yf + 4yf^2 - 3yf^3) / 2: the p2 weight
-    yr[3] = -(yf3 - yf2) >> 1;  // (yf^2 - yf^3) / 2: the p3 weight, negated; the unary minus binds before >>
-}
-#endif
-
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-// Float overloads of OneBiCubic. They sample the same four columns of the four rows as the
-// byte versions but return the interpolated value without rounding or clamping.
-
-// Bicubic sample of one float4 cell at horizontal position xf and vertical fraction yf.
-static float4 OneBiCubic(const float4 *yp0, const float4 *yp1, const float4 *yp2, const float4 *yp3,
-                         float xf, float yf, int width) {
-    const int firstColumn = static_cast<int>(floor(xf - 1));
-    const float fraction = xf - floor(xf);
-    const int lastColumn = width - 1;
-    const int c0 = std::max(0, firstColumn + 0);
-    const int c1 = std::max(0, firstColumn + 1);
-    const int c2 = std::min(lastColumn, firstColumn + 2);
-    const int c3 = std::min(lastColumn, firstColumn + 3);
-
-    auto interpolateRow = [&](const float4 *row) -> float4 {
-        return cubicInterpolate(row[c0], row[c1], row[c2], row[c3], fraction);
-    };
-
-    const float4 r0 = interpolateRow(yp0);
-    const float4 r1 = interpolateRow(yp1);
-    const float4 r2 = interpolateRow(yp2);
-    const float4 r3 = interpolateRow(yp3);
-    return cubicInterpolate(r0, r1, r2, r3, yf);
-}
-
-// Bicubic sample of one float2 cell at horizontal position xf and vertical fraction yf.
-static float2 OneBiCubic(const float2 *yp0, const float2 *yp1, const float2 *yp2, const float2 *yp3,
-                         float xf, float yf, int width) {
-    const int firstColumn = static_cast<int>(floor(xf - 1));
-    const float fraction = xf - floor(xf);
-    const int lastColumn = width - 1;
-    const int c0 = std::max(0, firstColumn + 0);
-    const int c1 = std::max(0, firstColumn + 1);
-    const int c2 = std::min(lastColumn, firstColumn + 2);
-    const int c3 = std::min(lastColumn, firstColumn + 3);
-
-    auto interpolateRow = [&](const float2 *row) -> float2 {
-        return cubicInterpolate(row[c0], row[c1], row[c2], row[c3], fraction);
-    };
-
-    const float2 r0 = interpolateRow(yp0);
-    const float2 r1 = interpolateRow(yp1);
-    const float2 r2 = interpolateRow(yp2);
-    const float2 r3 = interpolateRow(yp3);
-    return cubicInterpolate(r0, r1, r2, r3, yf);
-}
-
-// Bicubic sample of one float cell at horizontal position xf and vertical fraction yf.
-static float OneBiCubic(const float *yp0, const float *yp1, const float *yp2, const float *yp3,
-                        float xf, float yf, int width) {
-    const int firstColumn = static_cast<int>(floor(xf - 1));
-    const float fraction = xf - floor(xf);
-    const int lastColumn = width - 1;
-    const int c0 = std::max(0, firstColumn + 0);
-    const int c1 = std::max(0, firstColumn + 1);
-    const int c2 = std::min(lastColumn, firstColumn + 2);
-    const int c3 = std::min(lastColumn, firstColumn + 3);
-
-    auto interpolateRow = [&](const float *row) -> float {
-        return cubicInterpolate(row[c0], row[c1], row[c2], row[c3], fraction);
-    };
-
-    const float r0 = interpolateRow(yp0);
-    const float r1 = interpolateRow(yp1);
-    const float r2 = interpolateRow(yp2);
-    const float r3 = interpolateRow(yp3);
-    return cubicInterpolate(r0, r1, r2, r3, yf);
-}
-#endif
-
-/**
- * Resizes one row segment of 4 byte cells. processData also routes 3 byte cells here.
- *
- * @param outPtr   Where the first produced cell is written (column xstart of row currentY).
- * @param xstart   First output column to produce (inclusive).
- * @param xend     End output column (exclusive).
- * @param currentY Output row being produced.
- */
-void ResizeTask::kernelU4(uchar *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
-    const uchar *pin = mIn;
-    const int srcHeight = mInputSizeY;
-    const int srcWidth = mInputSizeX;
-    const size_t stride = mInputSizeX * paddedSize(mVectorSize);
-
-    // Map the centre of the output row to a vertical position in the source image.
-    // The AVX2 branch computes the same a * b - c with a fused multiply-subtract; it must use
-    // the mScaleY member, as no scaleY variable exists in this function.
-#if defined(ARCH_X86_HAVE_AVX2)
-    float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
-                                          _mm_set1_ps(mScaleY), _mm_set1_ps(0.5f)));
-#else
-    float yf = (currentY + 0.5f) * mScaleY - 0.5f;
-#endif
-
-    // Pick the four source rows around yf, clamped to the image, and keep only the fraction.
-    int starty = (int) floor(yf - 1);
-    yf = yf - floor(yf);
-    int maxy = srcHeight - 1;
-    int ys0 = std::max(0, starty + 0);
-    int ys1 = std::max(0, starty + 1);
-    int ys2 = std::min(maxy, starty + 2);
-    int ys3 = std::min(maxy, starty + 3);
-
-    const uchar4 *yp0 = (const uchar4 *)(pin + stride * ys0);
-    const uchar4 *yp1 = (const uchar4 *)(pin + stride * ys1);
-    const uchar4 *yp2 = (const uchar4 *)(pin + stride * ys2);
-    const uchar4 *yp3 = (const uchar4 *)(pin + stride * ys3);
-
-    uchar4 *out = ((uchar4 *)outPtr);
-    uint32_t x1 = xstart;
-    uint32_t x2 = xend;
-
-#if defined(ARCH_ARM_USE_INTRINSICS)
-    // Fast path: hand the whole segment to the external row kernel, using 16.16 fixed point.
-    if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
-        float xf = (x1 + 0.5f) * mScaleX - 0.5f;
-        long xf16 = rint(xf * 0x10000);
-        uint32_t xinc16 = rint(mScaleX * 0x10000);
-
-        int xoff = (xf16 >> 16) - 1;
-        int xclip = std::max(0, xoff) - xoff;
-        int len = x2 - x1;
-
-        int32_t yr[4];
-        uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
-        mkYCoeff(yr, yf);
-
-        xoff += xclip;
-
-        rsdIntrinsicResizeB4_K(
-                out, len,
-                xf16 & 0xffff, xinc16,
-                yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
-                xclip, srcWidth - xoff + xclip,
-                osc_ctl, yr);
-        out += len;
-        x1 += len;
-    }
-#endif
-
-    // Portable path: one bicubic sample per remaining output cell.
-    while(x1 < x2) {
-#if defined(ARCH_X86_HAVE_AVX2)
-        float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(mScaleX) ,
-                                              _mm_set1_ps(0.5f)));
-#else
-        float xf = (x1 + 0.5f) * mScaleX - 0.5f;
-#endif
-        *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
-        out++;
-        x1++;
-    }
-}
-
-/**
- * Resizes one row segment of 2 byte cells.
- *
- * @param outPtr   Where the first produced cell is written (column xstart of row currentY).
- * @param xstart   First output column to produce (inclusive).
- * @param xend     End output column (exclusive).
- * @param currentY Output row being produced.
- */
-void ResizeTask::kernelU2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
-    const uchar *pin = mIn;
-    const int srcHeight = mInputSizeY;
-    const int srcWidth = mInputSizeX;
-    const size_t stride = mInputSizeX * mVectorSize;
-
-    // Map the centre of the output row to a vertical position in the source image.
-    // The AVX2 branch computes the same a * b - c with a fused multiply-subtract; it must use
-    // the mScaleY member, as no scaleY variable exists in this function.
-#if defined(ARCH_X86_HAVE_AVX2)
-    float yf = _mm_cvtss_f32(
-            _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(mScaleY), _mm_set1_ps(0.5f)));
-#else
-    float yf = (currentY + 0.5f) * mScaleY - 0.5f;
-#endif
-
-    // Pick the four source rows around yf, clamped to the image, and keep only the fraction.
-    int starty = (int) floor(yf - 1);
-    yf = yf - floor(yf);
-    int maxy = srcHeight - 1;
-    int ys0 = std::max(0, starty + 0);
-    int ys1 = std::max(0, starty + 1);
-    int ys2 = std::min(maxy, starty + 2);
-    int ys3 = std::min(maxy, starty + 3);
-
-    const uchar2 *yp0 = (const uchar2 *)(pin + stride * ys0);
-    const uchar2 *yp1 = (const uchar2 *)(pin + stride * ys1);
-    const uchar2 *yp2 = (const uchar2 *)(pin + stride * ys2);
-    const uchar2 *yp3 = (const uchar2 *)(pin + stride * ys3);
-
-    uchar2 *out = ((uchar2 *)outPtr);
-    uint32_t x1 = xstart;
-    uint32_t x2 = xend;
-
-#if defined(ARCH_ARM_USE_INTRINSICS)
-    // Fast path: hand the whole segment to the external row kernel, using 16.16 fixed point.
-    if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
-        float xf = (x1 + 0.5f) * mScaleX - 0.5f;
-        long xf16 = rint(xf * 0x10000);
-        uint32_t xinc16 = rint(mScaleX * 0x10000);
-
-        int xoff = (xf16 >> 16) - 1;
-        int xclip = std::max(0, xoff) - xoff;
-        int len = x2 - x1;
-
-        int32_t yr[4];
-        uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
-        mkYCoeff(yr, yf);
-
-        xoff += xclip;
-
-        rsdIntrinsicResizeB2_K(
-                out, len,
-                xf16 & 0xffff, xinc16,
-                yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
-                xclip, srcWidth - xoff + xclip,
-                osc_ctl, yr);
-        out += len;
-        x1 += len;
-    }
-#endif
-
-    // Portable path: one bicubic sample per remaining output cell.
-    while(x1 < x2) {
-
-#if defined(ARCH_X86_HAVE_AVX2)
-        float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(mScaleX) ,
-                                              _mm_set1_ps(0.5f)));
-#else
-        float xf = (x1 + 0.5f) * mScaleX - 0.5f;
-#endif
-        *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
-        out++;
-        x1++;
-    }
-}
-
-void ResizeTask::kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
-    //ALOGI("TK kernelU1 xstart %u, xend %u, outstep %u", xstart, xend);
-    const uchar *pin = mIn;
-    const int srcHeight = mInputSizeY;
-    const int srcWidth = mInputSizeX;
-    const size_t stride = mInputSizeX * mVectorSize;
-
-    // ALOGI("Toolkit   ResizeU1 (%ux%u) by (%f,%f), xstart:%u to %u, stride %zu, out %p", srcWidth,
-    // srcHeight, scaleX, scaleY, xstart, xend, stride, outPtr);
-
-#if defined(ARCH_X86_HAVE_AVX2)
-    float yf = _mm_cvtss_f32(
-            _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
-#else
-    float yf = (currentY + 0.5f) * mScaleY - 0.5f;
-#endif
-
-    int starty = (int) floor(yf - 1);
-    yf = yf - floor(yf);
-    int maxy = srcHeight - 1;
-    int ys0 = std::max(0, starty + 0);
-    int ys1 = std::min(maxy, std::max(0, starty + 1));
-    int ys2 = std::min(maxy, starty + 2);
-    int ys3 = std::min(maxy, starty + 3);
-
-    const uchar *yp0 = pin + stride * ys0;
-    const uchar *yp1 = pin + stride * ys1;
-    const uchar *yp2 = pin + stride * ys2;
-    const uchar *yp3 = pin + stride * ys3;
-
-    uchar *out = ((uchar *)outPtr);
-    uint32_t x1 = xstart;
-    uint32_t x2 = xend;
-
-#if defined(ARCH_ARM_USE_INTRINSICS)
-    if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
-        float xf = (x1 + 0.5f) * mScaleX - 0.5f;
-        long xf16 = rint(xf * 0x10000);
-        uint32_t xinc16 = rint(mScaleX * 0x10000);
-
-        int xoff = (xf16 >> 16) - 1;
-        int xclip = std::max(0, xoff) - xoff;
-        int len = x2 - x1;
-
-        int32_t yr[4];
-        uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
-        mkYCoeff(yr, yf);
-
-        // ALOGI("ys0 %d, ys1 %d, ys2 %d, ys3 %d, x1 %u, x2 %u, xf %f, xf16 %ld, xinc16 %u, xoff %d,
-        // xclip %d, len %d, osc_ctl %lu)",
-        //       ys0, ys1, ys2, ys3, x1, x2, xf, xf16, xinc16, xoff, xclip, len, (unsigned long)
-        //       osc_ctl);
-        // ALOGI("TK scaleX %f, xf %f, xf16 %ld, xinc16 %d, xoff %d, xclip %d, len %d", scaleX, xf,
-        // xf16, xinc16, xoff, xclip, len); ALOGI("TK xf16 & 0xffff %ld, ys0 %u, ys1 %u, ys2 %u, ys3
-        // %u, srcWidth - xoff + xclip %d", xf16 & 0xffff, ys0, ys1, ys2, ys3, srcWidth - xoff);
-
-        xoff += xclip;
-
-        rsdIntrinsicResizeB1_K(
-                out, len,
-                xf16 & 0xffff, xinc16,
-                yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
-                xclip, srcWidth - xoff + xclip,
-                osc_ctl, yr);
-        out += len;
-        x1 += len;
-    }
-#endif
-
-    while(x1 < x2) {
-
-#if defined(ARCH_X86_HAVE_AVX2)
-        float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
-                                              _mm_set1_ps(0.5f)));
-#else
-        float xf = (x1 + 0.5f) * mScaleX - 0.5f;
-#endif
-
-        *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
-        out++;
-        x1++;
-    }
-}
-
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-void ResizeTask::kernelF4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
-    const uchar *pin = mIn;
-    const int srcHeight = inputSizeY;
-    const int srcWidth = inputSizeX;
-    const size_t stride = sizeX * vectorSize;
-
-#if defined(ARCH_X86_HAVE_AVX2)
-    float yf = _mm_cvtss_f32(
-            _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
-#else
-    float yf = (currentY + 0.5f) * scaleY - 0.5f;
-#endif
-
-    int starty = (int) floor(yf - 1);
-    yf = yf - floor(yf);
-    int maxy = srcHeight - 1;
-    int ys0 = std::max(0, starty + 0);
-    int ys1 = std::max(0, starty + 1);
-    int ys2 = std::min(maxy, starty + 2);
-    int ys3 = std::min(maxy, starty + 3);
-
-    const float4 *yp0 = (const float4 *)(pin + stride * ys0);
-    const float4 *yp1 = (const float4 *)(pin + stride * ys1);
-    const float4 *yp2 = (const float4 *)(pin + stride * ys2);
-    const float4 *yp3 = (const float4 *)(pin + stride * ys3);
-
-    float4 *out = ((float4 *)outPtr);
-    uint32_t x1 = xstart;
-    uint32_t x2 = xend;
-
-    while(x1 < x2) {
-
-#if defined(ARCH_X86_HAVE_AVX2)
-        float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
-                                              _mm_set1_ps(0.5f)));
-#else
-        float xf = (x1 + 0.5f) * scaleX - 0.5f;
-#endif
-
-        *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
-        out++;
-        x1++;
-    }
-}
-
-void ResizeTask::kernelF2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
-    const uchar *pin = mIn;
-    const int srcHeight = inputSizeY;
-    const int srcWidth = inputSizeX;
-    const size_t stride = sizeX * vectorSize;
-
-
-#if defined(ARCH_X86_HAVE_AVX2)
-    float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
-                                          _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
-#else
-    float yf = (currentY + 0.5f) * scaleY - 0.5f;
-#endif
-
-    int starty = (int) floor(yf - 1);
-    yf = yf - floor(yf);
-    int maxy = srcHeight - 1;
-    int ys0 = std::max(0, starty + 0);
-    int ys1 = std::max(0, starty + 1);
-    int ys2 = std::min(maxy, starty + 2);
-    int ys3 = std::min(maxy, starty + 3);
-
-    const float2 *yp0 = (const float2 *)(pin + stride * ys0);
-    const float2 *yp1 = (const float2 *)(pin + stride * ys1);
-    const float2 *yp2 = (const float2 *)(pin + stride * ys2);
-    const float2 *yp3 = (const float2 *)(pin + stride * ys3);
-
-    float2 *out = ((float2 *)outPtr);
-    uint32_t x1 = xstart;
-    uint32_t x2 = xend;
-
-    while(x1 < x2) {
-
-#if defined(ARCH_X86_HAVE_AVX2)
-        float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
-                                              _mm_set1_ps(0.5f)));
-#else
-        float xf = (x1 + 0.5f) * scaleX - 0.5f;
-#endif
-
-        *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
-        out++;
-        x1++;
-    }
-}
-
-void ResizeTask::kernelF1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
-    const uchar *pin = mIn;
-    const int srcHeight = inputSizeY;
-    const int srcWidth = inputSizeX;
-    const size_t stride = sizeX * vectorSize;
-
-
-#if defined(ARCH_X86_HAVE_AVX2)
-    float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
-                                          _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
-#else
-    float yf = (currentY + 0.5f) * scaleY - 0.5f;
-#endif
-
-    int starty = (int) floor(yf - 1);
-    yf = yf - floor(yf);
-    int maxy = srcHeight - 1;
-    int ys0 = std::max(0, starty + 0);
-    int ys1 = std::max(0, starty + 1);
-    int ys2 = std::min(maxy, starty + 2);
-    int ys3 = std::min(maxy, starty + 3);
-
-    const float *yp0 = (const float *)(pin + stride * ys0);
-    const float *yp1 = (const float *)(pin + stride * ys1);
-    const float *yp2 = (const float *)(pin + stride * ys2);
-    const float *yp3 = (const float *)(pin + stride * ys3);
-
-    float *out = ((float *)outPtr);
-    uint32_t x1 = xstart;
-    uint32_t x2 = xend;
-
-    while(x1 < x2) {
-
-#if defined(ARCH_X86_HAVE_AVX2)
-        float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
-                                              _mm_set1_ps(0.5f)));
-#else
-        float xf = (x1 + 0.5f) * scaleX - 0.5f;
-#endif
-
-        *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
-        out++;
-        x1++;
-    }
-}
-
-void ResizeTask::preLaunch(uint32_t slot, const RsScriptCall *sc)
-{
-
-    //check the data type to determine F or U.
-    if (mAlloc->getType()->getElement()->getType() == RS_TYPE_UNSIGNED_8) {
-        switch(mAlloc->getType()->getElement()->getVectorSize()) {
-        case 1:
-            mRootPtr = &kernelU1;
-            break;
-        case 2:
-            mRootPtr = &kernelU2;
-            break;
-        case 3:
-        case 4:
-            mRootPtr = &kernelU4;
-            break;
-        }
-    } else {
-        switch(mAlloc->getType()->getElement()->getVectorSize()) {
-        case 1:
-            mRootPtr = &kernelF1;
-            break;
-        case 2:
-            mRootPtr = &kernelF2;
-            break;
-        case 3:
-        case 4:
-            mRootPtr = &kernelF4;
-            break;
-        }
-    }
-}
-#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-
-void RenderScriptToolkit::resize(const uint8_t* input, uint8_t* output, size_t inputSizeX,
-                                 size_t inputSizeY, size_t vectorSize, size_t outputSizeX,
-                                 size_t outputSizeY, const Restriction* restriction) {
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
-    if (!validRestriction(LOG_TAG, outputSizeX, outputSizeY, restriction)) {
-        return;
-    }
-    if (vectorSize < 1 || vectorSize > 4) {
-        ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
-        return;
-    }
-#endif
-
-    ResizeTask task((const uchar*)input, (uchar*)output, inputSizeX, inputSizeY, vectorSize,
-                    outputSizeX, outputSizeY, restriction);
-    processor->doTask(&task);
-}
-
-}  // namespace renderscript
-}  // namespace android

diff --git a/toolkit/Resize_advsimd.S b/toolkit/Resize_advsimd.S
deleted file mode 100644
index 59e735c..0000000
--- a/toolkit/Resize_advsimd.S
+++ /dev/null

@@ -1,754 +0,0 @@
-/*
- * Copyright (C) 2015 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
-#define END(f) .size f, .-f;
-
-/* Fixed-point precision after vertical pass -- 16 bit data minus 1 sign and 1
- * integer (bicubic has a little overshoot).  It would also be possible to add
- * a temporary DC bias to eliminate the sign bit for more precision, but that's
- * extra arithmetic.
- */
-.set VERTBITS, 14
-
-/* The size of the scratch buffer in which we store our vertically convolved
- * intermediates.
- */
-.set CHUNKSHIFT, 7       /* 5 tests better for uchar4, but 7 is necessary for ridiculous (10:1) scale factors */
-.set CHUNKSIZE, (1 << CHUNKSHIFT)
-
-/* The number of components processed in a single iteration of the innermost
- * loop.
- */
-.set VECSHIFT, 3
-.set VECSIZE, (1<<VECSHIFT)
-
-/* Read four different lines (except at edges where addresses may be clamped,
- * which is why we don't simply take base and stride registers), and multiply
- * and accumulate them by the coefficients in v3[0..3], leaving the results in
- * v12.  This gives eight 16-bit results representing a horizontal line of 2-8
- * input pixels (depending on number of components per pixel) to be fed into
- * the horizontal scaling pass.
- *
- * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
- * known to represent negative values and VMLS is used to implement this).
- * Output is VERTBITS signed fixed-point, which must leave room for a little
- * v12.  This gives eight 16-bit results.
- */
-.macro vert8, dstlo=v12.4h, dsthi=v12.8h
-        ld1         {v8.8b}, [x4], #8
-        ld1         {v9.8b}, [x5], #8
-        ld1         {v10.8b}, [x6], #8
-        ld1         {v11.8b}, [x7], #8
-        uxtl        v8.8h, v8.8b
-        uxtl        v9.8h, v9.8b
-        uxtl        v10.8h, v10.8b
-        uxtl        v11.8h, v11.8b
-        umull       v12.4s, v9.4h, v3.h[1]
-        umull2      v13.4s, v9.8h, v3.h[1]
-        umlsl       v12.4s, v8.4h, v3.h[0]
-        umlsl2      v13.4s, v8.8h, v3.h[0]
-        umlal       v12.4s, v10.4h, v3.h[2]
-        umlal2      v13.4s, v10.8h, v3.h[2]
-        umlsl       v12.4s, v11.4h, v3.h[3]
-        umlsl2      v13.4s, v11.8h, v3.h[3]
-
-        /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
-         * minus VERTBITS (the number of fraction bits we want to keep from
-         * here on).
-         */
-        sqshrn      \dstlo, v12.4s, #8 + (16 - VERTBITS)
-        sqshrn2     \dsthi, v13.4s, #8 + (16 - VERTBITS)
-.endm
-
-/* As above, but only four 16-bit results into v12hi.
- */
-.macro vert4, dst=v12.8h
-        ld1         {v8.s}[0], [x4], #4
-        ld1         {v9.s}[0], [x5], #4
-        ld1         {v10.s}[0], [x6], #4
-        ld1         {v11.s}[0], [x7], #4
-        uxtl        v8.8h, v8.8b
-        uxtl        v9.8h, v9.8b
-        uxtl        v10.8h, v10.8b
-        uxtl        v11.8h, v11.8b
-        umull       v12.4s, v9.4h, v3.h[1]
-        umlsl       v12.4s, v8.4h, v3.h[0]
-        umlal       v12.4s, v10.4h, v3.h[2]
-        umlsl       v12.4s, v11.4h, v3.h[3]
-.ifc \dst,v12.8h
-        sqshrn2     \dst, v12.4s, #8 + (16 - VERTBITS)
-.else
-        sqshrn      \dst, v12.4s, #8 + (16 - VERTBITS)
-.endif
-.endm
-
-
-/* During horizontal resize having CHUNKSIZE input available means being able
- * to produce a varying amount of output, depending on the phase of the data.
- * This function calculates the minimum number of VECSIZE chunks extracted from
- * a CHUNKSIZE window (x1), and the threshold value for when the count will be
- * one higher than that (x0).
- * These work out, conveniently, to be the quotient and remainder from:
- *      (CHUNKSIZE + xinc * VECSIZE - 1) / (xinc * VECSIZE)
- *
- * The two values are packed together in a uint64_t for convenience; and
- * they are, in fact, used this way as an arithmetic short-cut later on.
- */
-/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc) */
-ENTRY(rsdIntrinsicResize_oscctl_K)
-        lsl         x2, x0, #VECSHIFT
-        mov         x0, #(CHUNKSIZE << 16) - 1
-        add         x0, x0, x2
-        udiv        x1, x0, x2
-        msub        x0, x1, x2, x0
-        add         x0, x0, x1, LSL #32
-        ret
-END(rsdIntrinsicResize_oscctl_K)
-
-/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
- * For the most part the vertical pass (the outer loop) is the same for all
- * versions.  Exceptions are handled in-line with conditional assembly.
- */
-.irp comp, 1, 2, 4
-.if \comp == 1
-.set COMPONENT_SHIFT, 0
-.elseif \comp == 2
-.set COMPONENT_SHIFT, 1
-.elseif \comp == 4
-.set COMPONENT_SHIFT, 2
-.else
-.error "Unknown component count"
-.endif
-.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT)
-.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT)
-
-.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2
-
-/* void rsdIntrinsicResizeB1_K(
- *             uint8_t * restrict dst,          // x0
- *             size_t count,                    // x1
- *             uint32_t xf,                     // x2
- *             uint32_t xinc,                   // x3
- *             uint8_t const * restrict srcn,   // x4
- *             uint8_t const * restrict src0,   // x5
- *             uint8_t const * restrict src1,   // x6
- *             uint8_t const * restrict src2,   // x7
- *             size_t xclip,                    // [sp,#0]  -> [sp,#80] -> x12
- *             size_t avail,                    // [sp,#8]  -> [sp,#88] -> x11
- *             uint64_t osc_ctl,                // [sp,#16] -> [sp,#96] -> x10
- *             int32 const *yr,                 // [sp,#24] -> [sp,#104] -> v4   (copied to v3   for scalar access)
- */
-ENTRY(rsdIntrinsicResizeB\comp\()_K)
-            sub         x8, sp, #48
-            sub         sp, sp, #80
-            st1         {v8.1d - v11.1d}, [sp]
-            st1         {v12.1d - v15.1d}, [x8]
-            str         x19, [x8, #32]
-
-            /* align the working buffer on the stack to make it easy to use bit
-             * twiddling for address calculations.
-             */
-            sub         x12, sp, #BUFFER_SIZE
-            bic         x12, x12, #(1 << (CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1)) - 1
-
-            ldr         x8, [sp,#104]           // yr
-            adrp        x9, intrinsic_resize_consts
-            add         x9, x9, :lo12:intrinsic_resize_consts
-            ld1         {v4.4s}, [x8]
-            ld1         {v5.8h}, [x9]
-            sqxtun      v4.4h, v4.4s            // yr
-            dup         v6.8h, w2
-            dup         v7.8h, w3
-            mla         v6.8h, v5.8h, v7.8h     // vxf
-            shl         v7.8h, v7.8h, #VECSHIFT // vxinc
-
-            /* Compute starting condition for oscillator used to compute ahead
-             * of time how many iterations are possible before needing to
-             * refill the working buffer.  This is based on the fixed-point
-             * index of the last element in the vector of pixels processed in
-             * each iteration, counting up until it would overflow.
-             */
-            sub         x8, x2, x3
-            lsl         x9, x3, #VECSHIFT
-            add         x8, x8, x9
-
-            ldr         x10, [sp,#96]           // osc_ctl
-            ldp         x13,x11, [sp,#80]       // xclip, avail
-
-            mov         x19, sp
-            mov         sp, x12
-
-            /* x4-x7 contain pointers to the four lines of input to be
-             * convolved.  These pointers have been clamped vertically and
-             * horizontally (which is why it's not a simple row/stride pair),
-             * and the xclip argument (now in x13) indicates how many pixels
-             * from true the x position of the pointer is.  This value should
-             * be 0, 1, or 2 only.
-             *
-             * Start by placing four pixels worth of input at the far end of
-             * the buffer.  As many as two of these may be clipped, so four
-             * pixels are fetched, and then the first pixel is duplicated and
-             * the data shifted according to xclip.  The source pointers are
-             * then also adjusted according to xclip so that subsequent fetches
-             * match.
-             */
-            mov         v3.8b, v4.8b  /* make y coeffs available for vert4 and vert8 macros */
-            sub         x14, x12, x13, LSL #(COMPONENT_SHIFT + 1)
-            add         x15, x12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
-            add         x14, x14, #4 * COMPONENT_COUNT * 2
-.if \comp == 1
-            vert4       v12.4h
-            dup         v11.4h, v12.h[0]
-            st1         {v11.4h,v12.4h}, [x12]
-            ld1         {v12.4h}, [x14]
-            st1         {v12.4h}, [x15]
-.elseif \comp == 2
-            vert8
-            dup         v11.4s, v12.s[0]
-            st1         {v11.8h,v12.8h}, [x12]
-            ld1         {v12.8h}, [x14]
-            st1         {v12.8h}, [x15]
-.elseif \comp == 4
-            vert8       v14.4h, v14.8h
-            vert8       v15.4h, v15.8h
-            dup         v12.2d, v14.d[0]
-            dup         v13.2d, v14.d[0]
-            st1         {v12.8h,v13.8h}, [x12], #32
-            st1         {v14.8h,v15.8h}, [x12]
-            sub         x12, x12, #32
-            ld1         {v11.8h,v12.8h}, [x14]
-            st1         {v11.8h,v12.8h}, [x15]
-.endif
-            /* Count off four pixels into the working buffer.
-             */
-            sub         x11, x11, #4
-            /* Incoming pointers were to the first _legal_ pixel.  Four pixels
-             * were read unconditionally, but some may have been discarded by
-             * xclip, so we rewind the pointers to compensate.
-             */
-            sub         x4, x4, x13, LSL #(COMPONENT_SHIFT)
-            sub         x5, x5, x13, LSL #(COMPONENT_SHIFT)
-            sub         x6, x6, x13, LSL #(COMPONENT_SHIFT)
-            sub         x7, x7, x13, LSL #(COMPONENT_SHIFT)
-
-            /* First tap starts where we just pre-filled, at the end of the
-             * buffer.
-             */
-            add         x2, x2, #(CHUNKSIZE * 2 - 4) << 16
-
-            /* Use overflowing arithmetic to implement wraparound array
-             * indexing.
-             */
-            lsl         x2, x2, #(47 - CHUNKSHIFT)
-            lsl         x3, x3, #(47 - CHUNKSHIFT)
-
-
-            /* Start of outermost loop.
-             * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the
-             * number of iterations of the inner loop that can be performed and
-             * get into that.
-             *
-             * The fill is complicated by the possibility of running out of
-             * input before the scratch buffer is filled.  If this isn't a risk
-             * then it's handled by the simple loop at 2:, otherwise the
-             * horrible loop at 3:.
-             */
-1:          mov         v3.8b, v4.8b            /* put y scaling coefficients somewhere handy */
-            subs        x11, x11, #CHUNKSIZE
-            bge         2f                      /* if at least CHUNKSIZE are available... */
-            add         x11, x11, #CHUNKSIZE    /* if they're not... */
-            b           4f
-            /* basic fill loop, processing 8 bytes at a time until there are
-             * fewer than eight bytes available.
-             */
-3:          vert8
-            sub         x11, x11, #8 / COMPONENT_COUNT
-            st1         {v12.8h}, [x12], #16
-4:          cmp         x11, #8 / COMPONENT_COUNT - 1
-            bgt         3b
-.if \comp == 4
-            blt         3f
-            /* The last pixel (four bytes) if necessary */
-            vert4
-.else
-            cmp         x11, #1
-            blt         3f
-            /* The last pixels if necessary */
-            sub         x4, x4, #8
-            sub         x5, x5, #8
-            sub         x6, x6, #8
-            sub         x7, x7, #8
-            add         x4, x4, x11, LSL #(COMPONENT_SHIFT)
-            add         x5, x5, x11, LSL #(COMPONENT_SHIFT)
-            add         x6, x6, x11, LSL #(COMPONENT_SHIFT)
-            add         x7, x7, x11, LSL #(COMPONENT_SHIFT)
-            vert8
-            sub         x11, sp, x11, LSL #(COMPONENT_SHIFT + 1)
-            sub         sp, sp, #32
-            sub         x11, x11, #16
-.if \comp == 1
-            dup         v13.8h, v12.h[7]
-.elseif \comp == 2
-            dup         v13.4s, v12.s[3]
-.endif
-            st1         {v12.8h,v13.8h}, [sp]
-            ld1         {v12.8h}, [x11]
-            add         sp, sp, #32
-            b           4f
-.endif
-            /* Keep filling until we get to the end of this chunk of the buffer */
-3:
-.if \comp == 1
-            dup         v12.8h, v12.h[7]
-.elseif \comp == 2
-            dup         v12.4s, v12.s[3]
-.elseif \comp == 4
-            dup         v12.2d, v12.d[1]
-.endif
-4:          st1         {v12.8h}, [x12], #16
-            tst         x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
-            bne         3b
-            b           4f
-
-.align 4
-2:          /* Quickly pull a chunk of data into the working buffer.
-             */
-            vert8
-            st1         {v12.8h}, [x12], #16
-            vert8
-            st1         {v12.8h}, [x12], #16
-            tst         x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
-            bne         2b
-            cmp         x11, #0
-            bne         3f
-4:          /* if we end with 0 pixels left we'll have nothing handy to spread
-             * across to the right, so we rewind a bit.
-             */
-            mov         x11, #1
-            sub         x4, x4, #COMPONENT_COUNT
-            sub         x5, x5, #COMPONENT_COUNT
-            sub         x6, x6, #COMPONENT_COUNT
-            sub         x7, x7, #COMPONENT_COUNT
-3:          /* copy four taps (width of cubic window) to far end for overflow
-             * address handling
-             */
-            sub         x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
-            eor         x12, x13, #CHUNKSIZE * COMPONENT_COUNT * 2
-.if \comp == 1
-            ld1         {v14.4h}, [x13]
-.elseif \comp == 2
-            ld1         {v14.8h}, [x13]
-.elseif \comp == 4
-            ld1         {v14.8h,v15.8h}, [x13]
-.endif
-            add         x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
-.if \comp == 1
-            st1         {v14.4h}, [x13]
-.elseif \comp == 2
-            st1         {v14.8h}, [x13]
-.elseif \comp == 4
-            st1         {v14.8h,v15.8h}, [x13]
-.endif
-            /* The high 32-bits of x10 contains the maximum possible iteration
-             * count, but if x8 is greater than the low 32-bits of x10 then
-             * this indicates that the count must be reduced by one for this
-             * iteration to avoid reading past the end of the available data.
-             */
-            sub         x13, x10, x8
-            lsr         x13, x13, #32
-
-            madd        x8, x13, x9, x8
-            sub         x8, x8, #(CHUNKSIZE << 16)
-
-            /* prefer to count pixels, rather than vectors, to clarify the tail
-             * store case on exit.
-             */
-            lsl         x13, x13, #VECSHIFT
-            cmp         x13, x1
-            csel        x13, x1, x13, gt
-
-            sub         x1, x1, x13
-
-            lsl         x13, x13, #COMPONENT_SHIFT
-
-            mov         w14, #0x8000
-            movi        v30.8h, #3
-            dup         v31.8h, w14
-
-            cmp         x13, #0
-            bgt         3f
-            cmp         x1, #0
-            bgt         1b     /* an extreme case where we shouldn't use code in this structure */
-            b           9f
-
-            .align 4
-2:          /* Inner loop continues here, but starts at 3:, see end of loop
-             * below for explanation. */
-.if LOOP_OUTPUT_SIZE == 4
-            st1         {v8.s}[0], [x0], #4
-.elseif LOOP_OUTPUT_SIZE == 8
-            st1         {v8.8b}, [x0], #8
-.elseif LOOP_OUTPUT_SIZE == 16
-            st1         {v8.16b}, [x0], #16
-.elseif LOOP_OUTPUT_SIZE == 32
-            st1         {v8.16b,v9.16b}, [x0], #32
-.endif
-            /* Inner loop:  here the four x coefficients for each tap are
-             * calculated in vector code, and the addresses are calculated in
-             * scalar code, and these calculations are interleaved.
-             */
-3:          ushr        v8.8h, v6.8h, #1            // sxf
-            lsr         x14, x2, #(63 - CHUNKSHIFT)
-            sqrdmulh    v9.8h, v8.8h, v8.8h         // sxf**2
-            add         x2, x2, x3
-            sqrdmulh    v10.8h, v9.8h, v8.8h        // sxf**3
-            lsr         x15, x2, #(63 - CHUNKSHIFT)
-            sshll       v11.4s, v9.4h, #2
-            sshll2      v12.4s, v9.8h, #2
-            add         x2, x2, x3
-            smlsl       v11.4s, v10.4h, v30.4h
-            smlsl2      v12.4s, v10.8h, v30.8h
-            lsr         x16, x2, #(63 - CHUNKSHIFT)
-
-            shadd       v0.8h, v10.8h, v8.8h
-            add         x2, x2, x3
-            sub         v0.8h, v9.8h, v0.8h
-            lsr         x17, x2, #(63 - CHUNKSHIFT)
-
-            saddw       v1.4s, v11.4s, v9.4h
-            saddw2      v13.4s, v12.4s, v9.8h
-            add         x2, x2, x3
-            shrn        v1.4h, v1.4s, #1
-            shrn2       v1.8h, v13.4s, #1
-            add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
-            sub         v1.8h, v1.8h, v31.8h
-            add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
-
-            saddw       v2.4s, v11.4s, v8.4h
-            saddw2      v13.4s, v12.4s, v8.8h
-            add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
-            shrn        v2.4h, v2.4s, #1
-            shrn2       v2.8h, v13.4s, #1
-            add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
-            neg         v2.8h, v2.8h
-
-            shsub       v3.8h, v10.8h, v9.8h
-
-            /* increment the x fractional parts (oveflow is ignored, as the
-             * scalar arithmetic shadows this addition with full precision).
-             */
-            add         v6.8h, v6.8h, v7.8h
-
-            /* At this point we have four pointers in x8-x11, pointing to the
-             * four taps in the scratch buffer that must be convolved together
-             * to produce an output pixel (one output pixel per pointer).
-             * These pointers usually overlap, but their spacing is irregular
-             * so resolving the redundancy through L1 is a pragmatic solution.
-             *
-             * The scratch buffer is made of signed 16-bit data, holding over
-             * some extra precision, and overshoot, from the vertical pass.
-             *
-             * We also have the 16-bit unsigned fixed-point weights for each
-             * of the four taps in v0 - v3.  That's eight pixels worth of
-             * coefficients when we have only four pointers, so calculations
-             * for four more pixels are interleaved with the fetch and permute
-             * code for each variant in the following code.
-             *
-             * The data arrangement is less than ideal for any pixel format,
-             * but permuting loads help to mitigate most of the problems.
-             *
-             * Note also that the two outside taps of a bicubic are negative,
-             * but these coefficients are unsigned.  The sign is hard-coded by
-             * use of multiply-and-subtract operations.
-             */
-.if \comp == 1
-            /* The uchar 1 case.
-             * Issue one lanewise ld4.h to load four consecutive pixels from
-             * one pointer (one pixel) into four different registers; then load
-             * four consecutive s16 values from the next pointer (pixel) into
-             * the next lane of those four registers, etc., so that we finish
-             * with v12 - v15 representing the four taps, and each lane
-             * representing a separate pixel.
-             *
-             * The first ld4 uses a splat to avoid any false dependency on
-             * the previous state of the register.
-             */
-            ld4r        {v12.8h,v13.8h,v14.8h,v15.8h}, [x14]
-            lsr         x14, x2, #(63 - CHUNKSHIFT)
-            add         x2, x2, x3
-            ld4         {v12.h,v13.h,v14.h,v15.h}[1], [x15]
-            add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
-            lsr         x15, x2, #(63 - CHUNKSHIFT)
-            add         x2, x2, x3
-            ld4         {v12.h,v13.h,v14.h,v15.h}[2], [x16]
-            add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
-            lsr         x16, x2, #(63 - CHUNKSHIFT)
-            add         x2, x2, x3
-            ld4         {v12.h,v13.h,v14.h,v15.h}[3], [x17]
-            add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
-            lsr         x17, x2, #(63 - CHUNKSHIFT)
-            add         x2, x2, x3
-            ld4         {v12.h,v13.h,v14.h,v15.h}[4], [x14]
-            add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
-            ld4         {v12.h,v13.h,v14.h,v15.h}[5], [x15]
-            ld4         {v12.h,v13.h,v14.h,v15.h}[6], [x16]
-            ld4         {v12.h,v13.h,v14.h,v15.h}[7], [x17]
-
-            smull       v8.4s, v12.4h, v0.4h
-            smull2      v9.4s, v12.8h, v0.8h
-            smlsl       v8.4s, v13.4h, v1.4h
-            smlsl2      v9.4s, v13.8h, v1.8h
-            smlsl       v8.4s, v14.4h, v2.4h
-            smlsl2      v9.4s, v14.8h, v2.8h
-            smlal       v8.4s, v15.4h, v3.4h
-            smlal2      v9.4s, v15.8h, v3.8h
-
-            subs        x13, x13, #LOOP_OUTPUT_SIZE
-
-            sqrshrn     v8.4h, v8.4s, #15
-            sqrshrn2    v8.8h, v9.4s, #15
-
-            sqrshrun    v8.8b, v8.8h, #VERTBITS - 8
-.elseif \comp == 2
-            /* The uchar2 case:
-             * This time load pairs of values into adjacent lanes in v12 - v15
-             * by aliasing them as u32 data; leaving room for only four pixels,
-             * so the process has to be done twice.  This also means that the
-             * coefficient registers fail to align with the coefficient data
-             * (eight separate pixels), so that has to be doubled-up to match.
-             */
-            ld4r        {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
-            lsr         x14, x2, #(63 - CHUNKSHIFT)
-            add         x2, x2, x3
-            ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x15]
-            add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
-            lsr         x15, x2, #(63 - CHUNKSHIFT)
-            add         x2, x2, x3
-            ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x16]
-            add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
-            lsr         x16, x2, #(63 - CHUNKSHIFT)
-            add         x2, x2, x3
-            ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x17]
-            add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
-            lsr         x17, x2, #(63 - CHUNKSHIFT)
-            add         x2, x2, x3
-
-            /* double-up coefficients to align with component pairs */
-            zip1        v16.8h, v0.8h, v0.8h
-            add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
-            zip1        v17.8h, v1.8h, v1.8h
-            zip1        v18.8h, v2.8h, v2.8h
-            zip1        v19.8h, v3.8h, v3.8h
-
-            smull       v8.4s, v12.4h, v16.4h
-            smull2      v9.4s, v12.8h, v16.8h
-            smlsl       v8.4s, v13.4h, v17.4h
-            smlsl2      v9.4s, v13.8h, v17.8h
-            smlsl       v8.4s, v14.4h, v18.4h
-            smlsl2      v9.4s, v14.8h, v18.8h
-            smlal       v8.4s, v15.4h, v19.4h
-            smlal2      v9.4s, v15.8h, v19.8h
-
-            sqrshrn     v8.4h, v8.4s, #15
-            sqrshrn2    v8.8h, v9.4s, #15
-
-            ld4r        {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
-            ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x15]
-            ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x16]
-            ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x17]
-
-            /* double-up coefficients to align with component pairs */
-            zip2        v16.8h, v0.8h, v0.8h
-            zip2        v17.8h, v1.8h, v1.8h
-            zip2        v18.8h, v2.8h, v2.8h
-            zip2        v19.8h, v3.8h, v3.8h
-
-            smull       v10.4s, v12.4h, v16.4h
-            smull2      v11.4s, v12.8h, v16.8h
-            smlsl       v10.4s, v13.4h, v17.4h
-            smlsl2      v11.4s, v13.8h, v17.8h
-            smlsl       v10.4s, v14.4h, v18.4h
-            smlsl2      v11.4s, v14.8h, v18.8h
-            smlal       v10.4s, v15.4h, v19.4h
-            smlal2      v11.4s, v15.8h, v19.8h
-
-            subs        x13, x13, #LOOP_OUTPUT_SIZE
-
-            sqrshrn     v9.4h, v10.4s, #15
-            sqrshrn2    v9.8h, v11.4s, #15
-
-            sqrshrun     v8.8b, v8.8h, #VERTBITS - 8
-            sqrshrun2    v8.16b, v9.8h, #VERTBITS - 8
-.elseif \comp == 4
-            /* The uchar4 case.
-             * This case is comparatively painless because four s16s are the
-             * smallest addressable unit for a vmul-by-scalar.  Rather than
-             * permute the data, simply arrange the multiplies to suit the way
-             * the data comes in.  That's a lot of data, though, so things
-             * progress in pairs of pixels at a time.
-             */
-            ld1         {v12.8h,v13.8h}, [x14]
-            lsr         x14, x2, #(63 - CHUNKSHIFT)
-            add         x2, x2, x3
-            ld1         {v14.8h,v15.8h}, [x15]
-            add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
-            lsr         x15, x2, #(63 - CHUNKSHIFT)
-            add         x2, x2, x3
-
-            smull       v8.4s, v12.4h, v0.h[0]
-            smull       v9.4s, v14.4h, v0.h[1]
-            smlsl2      v8.4s, v12.8h, v1.h[0]
-            smlsl2      v9.4s, v14.8h, v1.h[1]
-            smlsl       v8.4s, v13.4h, v2.h[0]
-            smlsl       v9.4s, v15.4h, v2.h[1]
-            smlal2      v8.4s, v13.8h, v3.h[0]
-            smlal2      v9.4s, v15.8h, v3.h[1]
-
-            /* And two more...  */
-            ld1         {v12.8h,v13.8h}, [x16]
-            add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
-            lsr         x16, x2, #(63 - CHUNKSHIFT)
-            add         x2, x2, x3
-            ld1         {v14.8h,v15.8h}, [x17]
-            add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
-            lsr         x17, x2, #(63 - CHUNKSHIFT)
-            add         x2, x2, x3
-
-            sqrshrn     v8.4h, v8.4s, #15
-            add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
-            sqrshrn2    v8.8h, v9.4s, #15
-
-            smull       v10.4s, v12.4h, v0.h[2]
-            smull       v11.4s, v14.4h, v0.h[3]
-            smlsl2      v10.4s, v12.8h, v1.h[2]
-            smlsl2      v11.4s, v14.8h, v1.h[3]
-            smlsl       v10.4s, v13.4h, v2.h[2]
-            smlsl       v11.4s, v15.4h, v2.h[3]
-            smlal2      v10.4s, v13.8h, v3.h[2]
-            smlal2      v11.4s, v15.8h, v3.h[3]
-
-            sqrshrn     v9.4h, v10.4s, #15
-            sqrshrn2    v9.8h, v11.4s, #15
-
-            sqrshrun     v8.8b, v8.8h, #VERTBITS - 8
-            sqrshrun2    v8.16b, v9.8h, #VERTBITS - 8
-
-            /* And two more...  */
-            ld1         {v12.8h,v13.8h}, [x14]
-            ld1         {v14.8h,v15.8h}, [x15]
-
-            smull       v10.4s, v12.4h, v0.h[4]
-            smull       v11.4s, v14.4h, v0.h[5]
-            smlsl2      v10.4s, v12.8h, v1.h[4]
-            smlsl2      v11.4s, v14.8h, v1.h[5]
-            smlsl       v10.4s, v13.4h, v2.h[4]
-            smlsl       v11.4s, v15.4h, v2.h[5]
-            smlal2      v10.4s, v13.8h, v3.h[4]
-            smlal2      v11.4s, v15.8h, v3.h[5]
-
-            /* And two more...  */
-            ld1         {v12.8h,v13.8h}, [x16]
-            ld1         {v14.8h,v15.8h}, [x17]
-
-            subs        x13, x13, #LOOP_OUTPUT_SIZE
-
-            sqrshrn     v9.4h, v10.4s, #15
-            sqrshrn2    v9.8h, v11.4s, #15
-
-            smull       v10.4s, v12.4h, v0.h[6]
-            smull       v11.4s, v14.4h, v0.h[7]
-            smlsl2      v10.4s, v12.8h, v1.h[6]
-            smlsl2      v11.4s, v14.8h, v1.h[7]
-            smlsl       v10.4s, v13.4h, v2.h[6]
-            smlsl       v11.4s, v15.4h, v2.h[7]
-            smlal2      v10.4s, v13.8h, v3.h[6]
-            smlal2      v11.4s, v15.8h, v3.h[7]
-
-            sqrshrn     v10.4h, v10.4s, #15
-            sqrshrn2    v10.8h, v11.4s, #15
-
-            sqrshrun     v9.8b, v9.8h, #VERTBITS - 8
-            sqrshrun2    v9.16b, v10.8h, #VERTBITS - 8
-.endif
-            bgt         2b      /* continue inner loop */
-            /* The inner loop has already been limited to ensure that none of
-             * the earlier iterations could overfill the output, so the store
-             * appears within the loop but after the conditional branch (at the
-             * top).  At the end, provided it won't overfill, perform the final
-             * store here.  If it would, then break out to the tricky tail case
-             * instead.
-             */
-            blt         1f
-            /* Store the amount of data appropriate to the configuration of the
-             * instance being assembled.
-             */
-.if LOOP_OUTPUT_SIZE == 4
-            st1         {v8.s}[0], [x0], #4
-.elseif LOOP_OUTPUT_SIZE == 8
-            st1         {v8.8b}, [x0], #8
-.elseif LOOP_OUTPUT_SIZE == 16
-            st1         {v8.16b}, [x0], #16
-.elseif LOOP_OUTPUT_SIZE == 32
-            st1         {v8.16b,v9.16b}, [x0], #32
-.endif
-            b           1b              /* resume outer loop */
-            /* Partial tail store case:
-             * Different versions of the code need different subsets of the
-             * following partial stores.  Here the number of components and the
-             * size of the chunk of data produced by each inner loop iteration
-             * is tested to figure out whether or not each phrase is relevant.
-             */
-.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16
-1:          tst         x13, #16
-            beq         1f
-            st1         {v8.16b}, [x0], #16
-            mov         v8.16b, v9.16b
-.endif
-.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8
-1:          tst         x13, #8
-            beq         1f
-            st1         {v8.8b}, [x0], #8
-            ext         v8.16b, v8.16b, v8.16b, #8
-.endif
-.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4
-1:          tst         x13, #4
-            beq         1f
-            st1         {v8.s}[0], [x0], #4
-            ext         v8.8b, v8.8b, v8.8b, #4
-.endif
-.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2
-1:          tst         x13, #2
-            beq         1f
-            st1         {v8.h}[0], [x0], #2
-            ext         v8.8b, v8.8b, v8.8b, #2
-.endif
-.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1
-1:          tst         x13, #1
-            beq         1f
-            st1         {v8.b}[0], [x0], #1
-.endif
-1:
-9:          mov         sp, x19
-            ld1         {v8.1d - v11.1d}, [sp], #32
-            ld1         {v12.1d - v15.1d}, [sp], #32
-            ldr         x19, [sp], #16
-            ret
-END(rsdIntrinsicResizeB\comp\()_K)
-.endr
-
-.rodata
-intrinsic_resize_consts:          .hword      0, 1, 2, 3, 4, 5, 6, 7

diff --git a/toolkit/Resize_neon.S b/toolkit/Resize_neon.S
deleted file mode 100644
index eb7f694..0000000
--- a/toolkit/Resize_neon.S
+++ /dev/null

@@ -1,799 +0,0 @@
-/*
- * Copyright (C) 2015 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
-#define END(f) .fnend; .size f, .-f;
-
-.eabi_attribute 25,1 @Tag_ABI_align8_preserved
-.arm                        /* assemble what follows as ARM (not Thumb) code */
-
-/* Fixed-point precision after vertical pass -- 16 bit data minus 1 sign and 1
- * integer (bicubic has a little overshoot).  It would also be possible to add
- * a temporary DC bias to eliminate the sign bit for more precision, but that's
- * extra arithmetic.
- */
-.set VERTBITS, 14
-
-/* The size of the scratch buffer in which we store our vertically convolved
- * intermediates.
- */
-.set CHUNKSHIFT, 7
-.set CHUNKSIZE, (1 << CHUNKSHIFT)       /* = 128 */
-
-/* The number of components processed in a single iteration of the innermost
- * loop.
- */
-.set VECSHIFT, 3
-.set VECSIZE, (1<<VECSHIFT)             /* = 8 */
-
-/* Read four different lines (except at edges where addresses may be clamped,
- * which is why we don't simply take base and stride registers), and multiply
- * and accumulate them by the coefficients in d6[0..3], leaving the results in
- * q12.  This gives eight 16-bit results representing a horizontal line of 2-8
- * input pixels (depending on number of components per pixel) to be fed into
- * the horizontal scaling pass.
- *
- * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
- * known to represent negative values and VMLS is used to implement this).
- * Output is VERTBITS signed fixed-point, which must leave room for a little
- * bit of overshoot beyond [0,1.0).
- */
-.macro vert8, dstlo=d24, dsthi=d25
-        vld1.u8     d16, [r4]!              /* fetch 8 bytes from the line at r4, advancing r4 */
-        vld1.u8     d18, [r5]!              /* likewise for the line at r5 */
-        vld1.u8     d20, [r6]!              /* likewise for the line at r6 */
-        vld1.u8     d22, [r7]!              /* likewise for the line at r7 */
-        vmovl.u8    q8, d16                 /* widen each line from u8 to u16 so that */
-        vmovl.u8    q9, d18                 /* it can feed the 16-bit multiplies below */
-        vmovl.u8    q10, d20
-        vmovl.u8    q11, d22
-        vmull.u16   q12, d18, d6[1]         /* q12 (lanes 0-3) = r5 line * coefficient 1 */
-        vmull.u16   q13, d19, d6[1]         /* q13 (lanes 4-7) = r5 line * coefficient 1 */
-        vmlsl.u16   q12, d16, d6[0]         /* subtract r4 line * coefficient 0 */
-        vmlsl.u16   q13, d17, d6[0]
-        vmlal.u16   q12, d20, d6[2]         /* add r6 line * coefficient 2 */
-        vmlal.u16   q13, d21, d6[2]
-        vmlsl.u16   q12, d22, d6[3]         /* subtract r7 line * coefficient 3 */
-        vmlsl.u16   q13, d23, d6[3]
-
-        /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
-         * minus VERTBITS (the number of fraction bits we want to keep from
-         * here on).
-         */
-        vqshrn.s32  \dstlo, q12, #8 + 16 - VERTBITS     /* saturating narrow to s16, lanes 0-3 */
-        vqshrn.s32  \dsthi, q13, #8 + 16 - VERTBITS     /* saturating narrow to s16, lanes 4-7 */
-.endm
-
-/* As above, but only four 16-bit results into d25.
- */
-.macro vert4
-        vld1.u32    d16[0], [r4]!           /* fetch 4 bytes into lane 0 only, advancing r4 */
-        vld1.u32    d18[0], [r5]!           /* likewise for the line at r5 */
-        vld1.u32    d20[0], [r6]!           /* likewise for the line at r6 */
-        vld1.u32    d22[0], [r7]!           /* likewise for the line at r7 */
-        vmovl.u8    q8, d16                 /* widen u8 to u16; only the low halves */
-        vmovl.u8    q9, d18                 /* (d16, d18, d20, d22) are read by the */
-        vmovl.u8    q10, d20                /* multiplies that follow */
-        vmovl.u8    q11, d22
-        vmull.u16   q12, d18, d6[1]         /* q12 = r5 line * coefficient 1 */
-        vmlsl.u16   q12, d16, d6[0]         /* subtract r4 line * coefficient 0 */
-        vmlal.u16   q12, d20, d6[2]         /* add r6 line * coefficient 2 */
-        vmlsl.u16   q12, d22, d6[3]         /* subtract r7 line * coefficient 3 */
-        vqshrn.s32  d25, q12, #8 + 16 - VERTBITS    /* saturating narrow to s16; same shift as vert8 */
-.endm
-
-
-/* During horizontal resize having CHUNKSIZE input available means being able
- * to produce a varying amount of output, depending on the phase of the data.
- * This function calculates the minimum number of VECSIZE chunks extracted from
- * a CHUNKSIZE window (r1), and the threshold value for when the count will be
- * one higher than that (r0).
- * These work out, conveniently, to be the quotient and remainder from:
- *      ((CHUNKSIZE << 16) + xinc * VECSIZE - 1) / (xinc * VECSIZE)
- *
- * The two values can be packed together in a uint64_t for convenience; and
- * they are, in fact, used this way as an arithmetic short-cut later on.
- */
-
-/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc); */
-ENTRY(rsdIntrinsicResize_oscctl_K)
-        lsl         r2, r0, #VECSHIFT       /* r2 = divisor = xinc * VECSIZE */
-        movw        r0, #:lower16:(CHUNKSIZE << 16) - 1
-        movt        r0, #:upper16:(CHUNKSIZE << 16) - 1
-        add         r0, r0, r2              /* r0 = dividend = (CHUNKSIZE << 16) - 1 + divisor */
-#if defined(ARCH_ARM_USE_UDIV)
-        udiv        r1, r0, r2              /* r1 = quotient */
-        mls         r0, r1, r2, r0          /* r0 = dividend - quotient * divisor = remainder */
-#else
-        clz         r3, r2                  /* software shift-and-subtract division: */
-        clz         r1, r0                  /* r3 = clz(divisor) - clz(dividend) is the */
-        subs        r3, r3, r1              /* left shift that aligns their top set bits */
-        movlt       r3, #0                  /* never shift by a negative amount */
-        mov         r1, #1
-        lsl         r2, r2, r3              /* r2 = divisor moved up to its starting position */
-        lsl         r3, r1, r3              /* r3 = quotient bit matching that position */
-        mov         r1, #0                  /* r1 = quotient accumulator */
-1:      cmp         r2, r0
-        addls       r1, r3                  /* shifted divisor <= remainder (unsigned): set the bit */
-        subls       r0, r2                  /* ...and take the shifted divisor out of the remainder */
-        lsrs        r3, r3, #1              /* next lower quotient bit; Z is set once it reaches zero */
-        lsr         r2, r2, #1              /* plain lsr leaves the flags from lsrs intact for bne */
-        bne         1b
-#endif
-        bx          lr                      /* returns r0 = remainder, r1 = quotient */
-END(rsdIntrinsicResize_oscctl_K)
-
-/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
- * For the most part the vertical pass (the outer loop) is the same for all
- * versions.  Exceptions are handled in-line with conditional assembly.
- */
-.irp comp, 1, 2, 4
-.if \comp == 1
-.set COMPONENT_SHIFT, 0
-.elseif \comp == 2
-.set COMPONENT_SHIFT, 1
-.elseif \comp == 4
-.set COMPONENT_SHIFT, 2
-.else
-.error "Unknown component count"
-.endif
-.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT)
-.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT)
-
-.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2
-.set OSC_STORE, (BUFFER_SIZE + 0)
-.set OSCSTEP_STORE, (BUFFER_SIZE + 4)
-.set OSCCTL_STORE, (BUFFER_SIZE + 8)
-.set AVAIL_STORE, (BUFFER_SIZE + 16)
-.set SP_STORE, (BUFFER_SIZE + 24)   /* should be +20, but rounded up to make a legal constant somewhere */
-
-/* void rsdIntrinsicResizeB\comp\()_K(
- *             uint8_t * restrict dst,          // r0
- *             size_t count,                    // r1
- *             uint32_t xf,                     // r2
- *             uint32_t xinc,                   // r3
- *             uint8_t const * restrict srcn,   // [sp]     -> [sp,#104] -> r4
- *             uint8_t const * restrict src0,   // [sp,#4]  -> [sp,#108] -> r5
- *             uint8_t const * restrict src1,   // [sp,#8]  -> [sp,#112] -> r6
- *             uint8_t const * restrict src2,   // [sp,#12] -> [sp,#116] -> r7
- *             size_t xclip,                    // [sp,#16] -> [sp,#120]
- *             size_t avail,                    // [sp,#20] -> [sp,#124] -> lr
- *             uint64_t osc_ctl,                // [sp,#24] -> [sp,#128]
- *             int32_t const *yr);              // [sp,#32] -> [sp,#136] -> d8 (copied to d6 for scalar access)
- */
-ENTRY(rsdIntrinsicResizeB\comp\()_K)
-            push        {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
-            vpush       {d8-d15}
-
-            /* align the working buffer on the stack to make it easy to use bit
-             * twiddling for address calculations and bounds tests.
-             */
-            sub         r12, sp, #BUFFER_SIZE + 32
-            mov         lr, sp
-            bfc         r12, #0, #CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1
-            mov         sp, r12
-            str         lr, [sp,#SP_STORE]
-
-            ldr         r8, [lr,#136]           // yr
-            adr         r9, 8f
-            vld1.s32    {q4}, [r8]
-            vld1.s16    {q5}, [r9]
-            vqmovun.s32 d8, q4                  // yr
-            vdup.s16    q6, r2
-            vdup.s16    q7, r3
-            vmla.s16    q6, q5, q7              // vxf
-            vshl.s16    q7, q7, #VECSHIFT       // vxinc
-
-            ldrd        r4,r5, [lr,#104]        // srcn, src0
-            ldrd        r6,r7, [lr,#112]        // src1, src2
-
-            /* Compute starting condition for oscillator used to compute ahead
-             * of time how many iterations are possible before needing to
-             * refill the working buffer.  This is based on the fixed-point
-             * index of the last element in the vector of pixels processed in
-             * each iteration, counting up until it would overflow.
-             */
-            sub         r8, r2, r3
-            mov         r9, r3, LSL #VECSHIFT
-            add         r8, r8, r9
-
-            ldrd        r10,r11, [lr,#128]      // osc_ctl
-
-            str         r8, [sp,#OSC_STORE]
-            str         r9, [sp,#OSCSTEP_STORE]
-            str         r10, [sp,#OSCCTL_STORE]
-            str         r11, [sp,#OSCCTL_STORE+4]
-            ldrd        r10,r11, [lr,#120]      // xclip,avail
-
-
-            /* r4-r7 contain pointers to the four lines of input to be
-             * convolved.  These pointers have been clamped vertically and
-             * horizontally (which is why it's not a simple row/stride pair),
-             * and the xclip argument (now in r10) indicates how many pixels
-             * from true the x position of the pointer is.  This value should
-             * be 0, 1, or 2 only.
-             *
-             * Start by placing four pixels worth of input at the far end of
-             * the buffer.  As many as two of these may be clipped, so four
-             * pixels are fetched, and then the first pixel is duplicated and
-             * the data shifted according to xclip.  The source pointers are
-             * then also adjusted according to xclip so that subsequent fetches
-             * match.
-             */
-            vmov        d6, d8  /* make y coeffs available for vert4 and vert8 macros */
-
-            sub         r8, r12, r10, LSL #COMPONENT_SHIFT + 1
-            add         r9, r12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
-            add         r8, r8, #4 * COMPONENT_COUNT * 2
-.if \comp == 1
-            vert4
-            vdup.s16    d24, d25[0]
-            vst1.s16    {q12}, [r12]
-            vld1.s16    {d24}, [r8]
-            vst1.s16    {d24}, [r9]
-.elseif \comp == 2
-            vert8
-            vdup.u32    q11, d24[0]
-            vst1.s16    {q11,q12}, [r12]
-            vld1.s16    {q12}, [r8]
-            vst1.s16    {q12}, [r9]
-.elseif \comp == 4
-            vert8       d28, d29
-            vert8       d30, d31
-            vmov.u64    d24, d28
-            vmov.u64    d25, d28
-            vmov.u64    d26, d28
-            vmov.u64    d27, d28
-            vst1.s16    {q12,q13}, [r12]!
-            vst1.s16    {q14,q15}, [r12]
-            sub         r12, r12, #32
-            vld1.s16    {q11,q12}, [r8]
-            vst1.s16    {q11,q12}, [r9]
-.endif
-            /* Count off four pixels into the working buffer, and move count to
-             * its new home.
-             */
-            sub         lr, r11, #4
-            /* Incoming pointers were to the first _legal_ pixel.  Four pixels
-             * were read unconditionally, but some may have been discarded by
-             * xclip, so we rewind the pointers to compensate.
-             */
-            sub         r4, r4, r10, LSL #COMPONENT_SHIFT
-            sub         r5, r5, r10, LSL #COMPONENT_SHIFT
-            sub         r6, r6, r10, LSL #COMPONENT_SHIFT
-            sub         r7, r7, r10, LSL #COMPONENT_SHIFT
-
-            /* First tap starts where we just pre-filled, at the end of the
-             * buffer.
-             */
-            add         r2, r2, #(CHUNKSIZE * 2 - 4) << 16
-
-            /* Use overflowing arithmetic to implement wraparound array
-             * indexing.
-             */
-            mov         r2, r2, LSL #(15 - CHUNKSHIFT)
-            mov         r3, r3, LSL #(15 - CHUNKSHIFT)
-
-            str         lr, [sp,#AVAIL_STORE]
-
-            /* Start of outermost loop.
-             * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the
-             * number of iterations of the inner loop that can be performed and
-             * get into that.
-             *
-             * The fill is complicated by the possibility of running out of
-             * input before the scratch buffer is filled.  If this isn't a risk
-             * then it's handled by the simple loop at 2:, otherwise the
-             * horrible loop at 3:.
-             */
-1:          ldr         lr, [sp,#AVAIL_STORE]   /* get number of pixels available */
-            vmov        d6, d8              /* put y scaling coefficients somewhere handy */
-            subs        lr, #CHUNKSIZE
-            bge         2f                  /* if at least CHUNKSIZE are available... */
-            add         lr, #CHUNKSIZE      /* if they're not... */
-            b           4f
-            /* ..just sneaking a literal in here after this unconditional branch.. */
-8:          .hword      0, 1, 2, 3, 4, 5, 6, 7
-            /* basic fill loop, processing 8 bytes at a time until there are
-             * fewer than eight bytes available.
-             */
-3:          vert8
-            sub         lr, lr, #8 / COMPONENT_COUNT
-            vst1.s16    {q12}, [r12]!
-4:          cmp         lr, #8 / COMPONENT_COUNT - 1
-            bgt         3b
-.if \comp == 4
-            blt         3f
-            /* The last pixel (four bytes) if necessary */
-            vert4
-.else
-            cmp         lr, #1
-            blt         3f
-            /* The last pixels if necessary */
-            sub         r4, r4, #8
-            sub         r5, r5, #8
-            sub         r6, r6, #8
-            sub         r7, r7, #8
-            add         r4, r4, lr, LSL #COMPONENT_SHIFT
-            add         r5, r5, lr, LSL #COMPONENT_SHIFT
-            add         r6, r6, lr, LSL #COMPONENT_SHIFT
-            add         r7, r7, lr, LSL #COMPONENT_SHIFT
-            vert8
-            sub         lr, sp, lr, LSL #COMPONENT_SHIFT + 1
-            sub         sp, sp, #32
-            sub         lr, lr, #16
-.if \comp == 1
-            vdup.s16    q13, d25[3]
-.elseif \comp == 2
-            vdup.u32    q13, d25[1]
-.endif
-            vst1.s16    {q12,q13}, [sp]
-            vld1.s16    {q12}, [lr]
-            add         sp, sp, #32
-            b           4f
-.endif
-            /* Keep filling until we get to the end of this chunk of the buffer */
-3:
-.if \comp == 1
-            vdup.s16    q12, d25[3]
-.elseif \comp == 2
-            vdup.u32    q12, d25[1]
-.elseif \comp == 4
-            vmov.u64    d24, d25
-.endif
-4:          vst1.s16    {q12}, [r12]!
-            tst         r12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
-            bne         3b
-            b           4f
-
-.align 4
-2:          /* Quickly pull a chunk of data into the working buffer.
-             */
-            vert8
-            vst1.s16    {q12}, [r12]!
-            vert8
-            vst1.s16    {q12}, [r12]!
-            tst         r12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
-            bne         2b
-            cmp         lr, #0
-            bne         3f
-4:          /* if we end with 0 pixels left we'll have nothing handy to spread
-             * across to the right, so we rewind a bit.
-             */
-            mov         lr, #1
-            sub         r4, r4, #COMPONENT_COUNT
-            sub         r5, r5, #COMPONENT_COUNT
-            sub         r6, r6, #COMPONENT_COUNT
-            sub         r7, r7, #COMPONENT_COUNT
-3:          str         lr, [sp,#AVAIL_STORE]       /* done with available pixel count */
-            add         lr, sp, #OSC_STORE
-            ldrd        r8,r9, [lr,#0]              /* need osc, osc_step soon */
-            ldrd        r10,r11, [lr,#OSCCTL_STORE-OSC_STORE] /* need osc_ctl too */
-
-            /* copy four taps (width of cubic window) to far end for overflow
-             * address handling
-             */
-            sub         lr, r12, #CHUNKSIZE * COMPONENT_COUNT * 2
-            eor         r12, lr, #CHUNKSIZE * COMPONENT_COUNT * 2
-.if \comp == 1
-            vld1.s16    {d28}, [lr]
-.elseif \comp == 2
-            vld1.s16    {q14}, [lr]
-.elseif \comp == 4
-            vld1.s16    {q14,q15}, [lr]
-.endif
-            add         lr, r12, #CHUNKSIZE * COMPONENT_COUNT * 2
-.if \comp == 1
-            vst1.s16    {d28}, [lr]
-.elseif \comp == 2
-            vst1.s16    {q14}, [lr]
-.elseif \comp == 4
-            vst1.s16    {q14,q15}, [lr]
-.endif
-            /* r11 contains the maximum possible iteration count, but if r8 is
-             * greater than r10 then this indicates that the count must be
-             * reduced by one for this iteration to avoid reading past the end
-             * of the available data.
-             */
-            cmp             r10, r8
-            sbc         lr, r11, #0
-
-            mla         r8, lr, r9, r8
-            sub         r8, r8, #(CHUNKSIZE << 16)
-
-            str         r8, [sp,#OSC_STORE]         /* done with osc */
-
-            /* prefer to count pixels, rather than vectors, to clarify the tail
-             * store case on exit.
-             */
-            mov         lr, lr, LSL #VECSHIFT
-            cmp         lr, r1
-            movgt       lr, r1
-
-            sub         r1, r1, lr
-
-            mov         lr, lr, LSL #COMPONENT_SHIFT
-
-            vmov.i16    d10, #3
-            vmov.i16    d11, #0x8000
-
-            cmp         lr, #0
-            bgt         3f
-            cmp         r1, #0
-            bgt         1b     /* an extreme case where we shouldn't use code in this structure */
-            b           9f
-
-            .align 4
-2:          /* Inner loop continues here, but starts at 3:, see end of loop
-             * below for explanation. */
-.if LOOP_OUTPUT_SIZE == 4
-            vst1.u32    {d16[0]}, [r0]!
-.elseif LOOP_OUTPUT_SIZE == 8
-            vst1.u8     {d16}, [r0]!
-.elseif LOOP_OUTPUT_SIZE == 16
-            vst1.u8     {q8}, [r0]!
-.elseif LOOP_OUTPUT_SIZE == 32
-            vst1.u8     {q8,q9}, [r0]!
-.endif
-            /* Inner loop:  here the four x coefficients for each tap are
-             * calculated in vector code, and the addresses are calculated in
-             * scalar code, and these calculations are interleaved.
-             */
-3:          vshr.u16    q8, q6, #1
-            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
-            vqrdmulh.s16 q9, q8, q8
-            add         r2, r2, r3
-            vqrdmulh.s16 q10, q9, q8
-            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
-            vshll.s16   q11, d18, #2
-            vshll.s16   q12, d19, #2
-            add         r2, r2, r3
-            vmlsl.s16   q11, d20, d10
-            vmlsl.s16   q12, d21, d10
-            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
-
-            vhadd.s16   q0, q10, q8
-            add         r2, r2, r3
-            vsub.s16    q0, q9, q0
-            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
-
-            vaddw.s16   q1, q11, d18
-            vaddw.s16   q13, q12, d19
-            add         r2, r2, r3
-            vshrn.s32   d2, q1, #1
-            vshrn.s32   d3, q13, #1
-            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
-            vsub.s16    d2, d2, d11
-            vsub.s16    d3, d3, d11 // TODO: find a wider d11 and use q-reg operation
-            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
-
-            vaddw.s16   q2, q11, d16
-            vaddw.s16   q13, q12, d17
-            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
-            vshrn.s32   d4, q2, #1
-            vshrn.s32   d5, q13, #1
-            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
-            vneg.s16    q2, q2
-
-            vhsub.s16   q3, q10, q9
-
-            /* increment the x fractional parts (overflow is ignored, as the
-             * scalar arithmetic shadows this addition with full precision).
-             */
-            vadd.s16    q6, q6, q7
-
-            /* At this point we have four pointers in r8-r11, pointing to the
-             * four taps in the scratch buffer that must be convolved together
-             * to produce an output pixel (one output pixel per pointer).
-             * These pointers usually overlap, but their spacing is irregular
-             * so resolving the redundancy through L1 is a pragmatic solution.
-             *
-             * The scratch buffer is made of signed 16-bit data, holding over
-             * some extra precision, and overshoot, from the vertical pass.
-             *
-             * We also have the 16-bit unsigned fixed-point weights for each
-             * of the four taps in q0 - q3.  That's eight pixels worth of
-             * coefficients when we have only four pointers, so calculations
-             * for four more pixels are interleaved with the fetch and permute
-             * code for each variant in the following code.
-             *
-             * The data arrangement is less than ideal for any pixel format,
-             * but permuting loads help to mitigate most of the problems.
-             *
-             * Note also that the two outside taps of a bicubic are negative,
-             * but these coefficients are unsigned.  The sign is hard-coded by
-             * use of multiply-and-subtract operations.
-             */
-.if \comp == 1
-            /* The uchar 1 case.
-             * Issue one lanewise vld4.s16 to load four consecutive pixels from
-             * one pointer (one pixel) into four different registers; then load
-             * four consecutive s16 values from the next pointer (pixel) into
-             * the next lane of those four registers, etc., so that we finish
-             * with q12 - q15 representing the four taps, and each lane
-             * representing a separate pixel.
-             *
-             * The first vld4 uses a splat to avoid any false dependency on
-             * the previous state of the register.
-             */
-            vld4.s16    {d24[],d26[],d28[],d30[]}, [r8]
-            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
-            add         r2, r2, r3
-            vld4.s16    {d24[1],d26[1],d28[1],d30[1]}, [r9]
-            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
-            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
-            add         r2, r2, r3
-            vld4.s16    {d24[2],d26[2],d28[2],d30[2]}, [r10]
-            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
-            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
-            add         r2, r2, r3
-            vld4.s16    {d24[3],d26[3],d28[3],d30[3]}, [r11]
-            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
-            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
-            add         r2, r2, r3
-            vld4.s16    {d25[],d27[],d29[],d31[]}, [r8]
-            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
-            vld4.s16    {d25[1],d27[1],d29[1],d31[1]}, [r9]
-            vld4.s16    {d25[2],d27[2],d29[2],d31[2]}, [r10]
-            vld4.s16    {d25[3],d27[3],d29[3],d31[3]}, [r11]
-
-            vmull.s16   q8, d24, d0
-            vmull.s16   q9, d25, d1
-            vmlsl.s16   q8, d26, d2
-            vmlsl.s16   q9, d27, d3
-            vmlsl.s16   q8, d28, d4
-            vmlsl.s16   q9, d29, d5
-            vmlal.s16   q8, d30, d6
-            vmlal.s16   q9, d31, d7
-
-            subs        lr, lr, #LOOP_OUTPUT_SIZE
-
-            vqrshrn.s32 d16, q8, #15
-            vqrshrn.s32 d17, q9, #15
-
-            vqrshrun.s16 d16, q8, #VERTBITS - 8
-.elseif \comp == 2
-            /* The uchar2 case:
-             * This time load pairs of values into adjacent lanes in q12 - q15
-             * by aliasing them as u32 data; leaving room for only four pixels,
-             * so the process has to be done twice.  This also means that the
-             * coefficient registers fail to align with the coefficient data
-             * (eight separate pixels), so that has to be doubled-up to match.
-             */
-            vld4.u32    {d24[],d26[],d28[],d30[]}, [r8]
-            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
-            add         r2, r2, r3
-            vld4.u32    {d24[1],d26[1],d28[1],d30[1]}, [r9]
-            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
-            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
-            add         r2, r2, r3
-            vld4.u32    {d25[],d27[],d29[],d31[]}, [r10]
-            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
-            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
-            add         r2, r2, r3
-            vld4.u32    {d25[1],d27[1],d29[1],d31[1]}, [r11]
-            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
-            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
-            add         r2, r2, r3
-
-            /* double-up coefficients to align with component pairs */
-            vmov        d20, d0
-            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
-            vmov        d21, d2
-            vmov        d22, d4
-            vmov        d23, d6
-            vzip.s16    d0, d20
-            vzip.s16    d2, d21
-            vzip.s16    d4, d22
-            vzip.s16    d6, d23
-
-            vmull.s16   q8, d24, d0
-            vmull.s16   q9, d25, d20
-            vmlsl.s16   q8, d26, d2
-            vmlsl.s16   q9, d27, d21
-            vmlsl.s16   q8, d28, d4
-            vmlsl.s16   q9, d29, d22
-            vmlal.s16   q8, d30, d6
-            vmlal.s16   q9, d31, d23
-
-            vqrshrn.s32 d16, q8, #15
-            vqrshrn.s32 d17, q9, #15
-
-            vld4.u32    {d24[],d26[],d28[],d30[]}, [r8]
-            vld4.u32    {d24[1],d26[1],d28[1],d30[1]}, [r9]
-            vld4.u32    {d25[],d27[],d29[],d31[]}, [r10]
-            vld4.u32    {d25[1],d27[1],d29[1],d31[1]}, [r11]
-
-            /* double-up coefficients to align with component pairs */
-            vmov        d0, d1
-            vmov        d2, d3
-            vmov        d4, d5
-            vmov        d6, d7
-            vzip.s16    d0, d1
-            vzip.s16    d2, d3
-            vzip.s16    d4, d5
-            vzip.s16    d6, d7
-
-            vmull.s16   q10, d24, d0
-            vmull.s16   q11, d25, d1
-            vmlsl.s16   q10, d26, d2
-            vmlsl.s16   q11, d27, d3
-            vmlsl.s16   q10, d28, d4
-            vmlsl.s16   q11, d29, d5
-            vmlal.s16   q10, d30, d6
-            vmlal.s16   q11, d31, d7
-
-            subs        lr, lr, #LOOP_OUTPUT_SIZE
-
-            vqrshrn.s32 d18, q10, #15
-            vqrshrn.s32 d19, q11, #15
-
-            vqrshrun.s16 d16, q8, #VERTBITS - 8
-            vqrshrun.s16 d17, q9, #VERTBITS - 8
-.elseif \comp == 4
-            /* The uchar4 case.
-             * This case is comparatively painless because four s16s are the
-             * smallest addressable unit for a vmul-by-scalar.  Rather than
-             * permute the data, simply arrange the multiplies to suit the way
-             * the data comes in.  That's a lot of data, though, so things
-             * progress in pairs of pixels at a time.
-             */
-            vld1.s16    {q12,q13}, [r8]
-            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
-            add         r2, r2, r3
-            vld1.s16    {q14,q15}, [r9]
-            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
-            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
-            add         r2, r2, r3
-
-            vmull.s16   q8, d24, d0[0]
-            vmull.s16   q9, d28, d0[1]
-            vmlsl.s16   q8, d25, d2[0]
-            vmlsl.s16   q9, d29, d2[1]
-            vmlsl.s16   q8, d26, d4[0]
-            vmlsl.s16   q9, d30, d4[1]
-            vmlal.s16   q8, d27, d6[0]
-            vmlal.s16   q9, d31, d6[1]
-
-            /* And two more...  */
-            vld1.s16    {q12,q13}, [r10]
-            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
-            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
-            add         r2, r2, r3
-            vld1.s16    {q14,q15}, [r11]
-            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
-            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
-            add         r2, r2, r3
-
-            vqrshrn.s32 d16, q8, #15
-            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
-            vqrshrn.s32 d17, q9, #15
-
-            vmull.s16   q10, d24, d0[2]
-            vmull.s16   q11, d28, d0[3]
-            vmlsl.s16   q10, d25, d2[2]
-            vmlsl.s16   q11, d29, d2[3]
-            vmlsl.s16   q10, d26, d4[2]
-            vmlsl.s16   q11, d30, d4[3]
-            vmlal.s16   q10, d27, d6[2]
-            vmlal.s16   q11, d31, d6[3]
-
-            vqrshrn.s32 d18, q10, #15
-            vqrshrn.s32 d19, q11, #15
-
-            vqrshrun.s16 d16, q8, #VERTBITS - 8
-            vqrshrun.s16 d17, q9, #VERTBITS - 8
-
-            /* And two more...  */
-            vld1.s16    {q12,q13}, [r8]
-            vld1.s16    {q14,q15}, [r9]
-
-            vmull.s16   q10, d24, d1[0]
-            vmull.s16   q11, d28, d1[1]
-            vmlsl.s16   q10, d25, d3[0]
-            vmlsl.s16   q11, d29, d3[1]
-            vmlsl.s16   q10, d26, d5[0]
-            vmlsl.s16   q11, d30, d5[1]
-            vmlal.s16   q10, d27, d7[0]
-            vmlal.s16   q11, d31, d7[1]
-
-            /* And two more...  */
-            vld1.s16    {q12,q13}, [r10]
-            vld1.s16    {q14,q15}, [r11]
-
-            subs        lr, lr, #LOOP_OUTPUT_SIZE
-
-            vqrshrn.s32 d18, q10, #15
-            vqrshrn.s32 d19, q11, #15
-
-            vmull.s16   q10, d24, d1[2]
-            vmull.s16   q11, d28, d1[3]
-            vmlsl.s16   q10, d25, d3[2]
-            vmlsl.s16   q11, d29, d3[3]
-            vmlsl.s16   q10, d26, d5[2]
-            vmlsl.s16   q11, d30, d5[3]
-            vmlal.s16   q10, d27, d7[2]
-            vmlal.s16   q11, d31, d7[3]
-
-            vqrshrn.s32 d20, q10, #15
-            vqrshrn.s32 d21, q11, #15
-
-            vqrshrun.s16 d18, q9, #VERTBITS - 8
-            vqrshrun.s16 d19, q10, #VERTBITS - 8
-.endif
-            bgt         2b      /* continue inner loop */
-            /* The inner loop has already been limited to ensure that none of
-             * the earlier iterations could overfill the output, so the store
-             * appears within the loop but after the conditional branch (at the
-             * top).  At the end, provided it won't overfill, perform the final
-             * store here.  If it would, then break out to the tricky tail case
-             * instead.
-             */
-            blt         1f
-            /* Store the amount of data appropriate to the configuration of the
-             * instance being assembled.
-             */
-.if LOOP_OUTPUT_SIZE == 4
-            vst1.u32    {d16[0]}, [r0]!
-.elseif LOOP_OUTPUT_SIZE == 8
-            vst1.u8     {d16}, [r0]!
-.elseif LOOP_OUTPUT_SIZE == 16
-            vst1.u8     {q8}, [r0]!
-.elseif LOOP_OUTPUT_SIZE == 32
-            vst1.u8     {q8,q9}, [r0]!
-.endif
-            b           1b              /* resume outer loop */
-            /* Partial tail store case:
-             * Different versions of the code need different subsets of the
-             * following partial stores.  Here the number of components and the
-             * size of the chunk of data produced by each inner loop iteration
-             * is tested to figure out whether or not each phrase is relevant.
-             */
-.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16
-1:          tst         lr, #16
-            beq         1f
-            vst1.u8     {q8}, [r0]!
-            vmov        q8, q9
-.endif
-.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8
-1:          tst         lr, #8
-            beq         1f
-            vst1.u8     {d16}, [r0]!
-            vmov.u8     d16, d17
-.endif
-.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4
-1:          tst         lr, #4
-            beq         1f
-            vst1.u32    {d16[0]}, [r0]!
-            vext.u32    d16, d16, d16, #1
-.endif
-.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2
-1:          tst         lr, #2
-            beq         1f
-            vst1.u16    {d16[0]}, [r0]!
-            vext.u16    d16, d16, d16, #1
-.endif
-.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1
-1:          tst         lr, #1
-            beq         1f
-            vst1.u8     {d16[0]}, [r0]!
-.endif
-1:
-9:          ldr         sp, [sp,#SP_STORE]
-            vpop        {d8-d15}
-            pop         {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
-END(rsdIntrinsicResizeB\comp\()_K)
-.endr

diff --git a/toolkit/TaskProcessor.cpp b/toolkit/TaskProcessor.cpp
deleted file mode 100644
index d9ae83c..0000000
--- a/toolkit/TaskProcessor.cpp
+++ /dev/null

@@ -1,221 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "TaskProcessor.h"
-
-#include <assert.h>
-#include <sys/prctl.h>
-
-#include "RenderScriptToolkit.h"
-#include "Utils.h"
-
-#define LOG_TAG "renderscript.toolkit.TaskProcessor"
-
-namespace android {
-namespace renderscript {
-
-int Task::setTiling(unsigned int targetTileSizeInBytes) {
-    // Empirically, values smaller than 1000 are unlikely to give good performance.
-    targetTileSizeInBytes = std::max(1000u, targetTileSizeInBytes);
-    const size_t cellSizeInBytes =
-            mVectorSize;  // If we add float support, vectorSize * 4 for that.
-    const size_t targetCellsPerTile = targetTileSizeInBytes / cellSizeInBytes;
-    assert(targetCellsPerTile > 0);
-
-    size_t cellsToProcessY;
-    size_t cellsToProcessX;
-    if (mRestriction == nullptr) {
-        cellsToProcessX = mSizeX;
-        cellsToProcessY = mSizeY;
-    } else {
-        assert(mRestriction->endX > mRestriction->startX);
-        assert(mRestriction->endY > mRestriction->startY);
-        cellsToProcessX = mRestriction->endX - mRestriction->startX;
-        cellsToProcessY = mRestriction->endY - mRestriction->startY;
-    }
-
-    // We want rows as large as possible, as the SIMD code we have is more efficient with
-    // large rows.
-    mTilesPerRow = divideRoundingUp(cellsToProcessX, targetCellsPerTile);
-    // Once we know the number of tiles per row, we divide that row evenly. We round up to make
-    // sure all cells are included in the last tile of the row.
-    mCellsPerTileX = divideRoundingUp(cellsToProcessX, mTilesPerRow);
-
-    // We do the same thing for the Y direction.
-    size_t targetRowsPerTile = divideRoundingUp(targetCellsPerTile, mCellsPerTileX);
-    mTilesPerColumn = divideRoundingUp(cellsToProcessY, targetRowsPerTile);
-    mCellsPerTileY = divideRoundingUp(cellsToProcessY, mTilesPerColumn);
-
-    return mTilesPerRow * mTilesPerColumn;
-}
-
-void Task::processTile(unsigned int threadIndex, size_t tileIndex) {
-    // Figure out the overall boundaries.
-    size_t startWorkX;
-    size_t startWorkY;
-    size_t endWorkX;
-    size_t endWorkY;
-    if (mRestriction == nullptr) {
-        startWorkX = 0;
-        startWorkY = 0;
-        endWorkX = mSizeX;
-        endWorkY = mSizeY;
-    } else {
-        startWorkX = mRestriction->startX;
-        startWorkY = mRestriction->startY;
-        endWorkX = mRestriction->endX;
-        endWorkY = mRestriction->endY;
-    }
-    // Figure out the rectangle for this tileIndex. All our tiles form a 2D grid. Identify
-    // first the X, Y coordinate of our tile in that grid.
-    size_t tileIndexY = tileIndex / mTilesPerRow;
-    size_t tileIndexX = tileIndex % mTilesPerRow;
-    // Calculate the starting and ending point of that tile.
-    size_t startCellX = startWorkX + tileIndexX * mCellsPerTileX;
-    size_t startCellY = startWorkY + tileIndexY * mCellsPerTileY;
-    size_t endCellX = std::min(startCellX + mCellsPerTileX, endWorkX);
-    size_t endCellY = std::min(startCellY + mCellsPerTileY, endWorkY);
-
-    // Call the derived class to do the specific work.
-    if (mPrefersDataAsOneRow && startCellX == 0 && endCellX == mSizeX) {
-        // When the tile covers entire rows, we can take advantage that some ops are not 2D.
-        processData(threadIndex, 0, startCellY, mSizeX * (endCellY - startCellY), startCellY + 1);
-    } else {
-        processData(threadIndex, startCellX, startCellY, endCellX, endCellY);
-    }
-}
-
-TaskProcessor::TaskProcessor(unsigned int numThreads)
-    : mUsesSimd{cpuSupportsSimd()},
-      /* If the requested number of threads is 0, we'll decide based on the number of cores.
-       * Through empirical testing, we've found that using more than 6 threads does not help.
-       * There may be more optimal choices to make depending on the SoC but we'll stick to
-       * this simple heuristic for now.
-       *
-       * We'll re-use the thread that calls the processor doTask method, so we'll spawn one less
-       * worker pool thread than the total number of threads.
-       */
-      mNumberOfPoolThreads{numThreads ? numThreads - 1
-                                      : std::min(6u, std::thread::hardware_concurrency() - 1)} {
-    for (size_t i = 0; i < mNumberOfPoolThreads; i++) {
-        mPoolThreads.emplace_back(
-                std::bind(&TaskProcessor::processTilesOfWork, this, i + 1, false));
-    }
-}
-
-TaskProcessor::~TaskProcessor() {
-    {
-        std::lock_guard<std::mutex> lock(mQueueMutex);
-        mStopThreads = true;
-        mWorkAvailableOrStop.notify_all();
-    }
-
-    for (auto& thread : mPoolThreads) {
-        thread.join();
-    }
-}
-
-void TaskProcessor::processTilesOfWork(int threadIndex, bool returnWhenNoWork) {
-    if (threadIndex != 0) {
-        // Set the name of the thread, except for thread 0, which is not part of the pool.
-        // PR_SET_NAME takes a maximum of 16 characters, including the terminating null.
-        char name[16]{"RenderScToolkit"};
-        prctl(PR_SET_NAME, name, 0, 0, 0);
-        // ALOGI("Starting thread%d", threadIndex);
-    }
-
-    std::unique_lock<std::mutex> lock(mQueueMutex);
-    while (true) {
-        mWorkAvailableOrStop.wait(lock, [this, returnWhenNoWork]() REQUIRES(mQueueMutex) {
-            return mStopThreads || (mTilesNotYetStarted > 0) ||
-                   (returnWhenNoWork && (mTilesNotYetStarted == 0));
-        });
-        // ALOGI("Woke thread%d", threadIndex);
-
-        // This ScopedLockAssertion is to help the compiler when it checks thread annotations
-        // to realize that we have the lock. It's however not completely true; we don't
-        // hold the lock while processing the tile.
-        // TODO Figure out how to fix that.
-        android::base::ScopedLockAssertion lockAssert(mQueueMutex);
-        if (mStopThreads || (returnWhenNoWork && mTilesNotYetStarted == 0)) {
-            break;
-        }
-
-        while (mTilesNotYetStarted > 0 && !mStopThreads) {
-            // This picks the tiles in decreasing order but that does not matter.
-            int myTile = --mTilesNotYetStarted;
-            mTilesInProcess++;
-            lock.unlock();
-            {
-                // We won't be executing this code unless the main thread is
-                // holding the mTaskMutex lock, which guards mCurrentTask.
-                // The compiler can't figure this out.
-                android::base::ScopedLockAssertion lockAssert(mTaskMutex);
-                mCurrentTask->processTile(threadIndex, myTile);
-            }
-            lock.lock();
-            mTilesInProcess--;
-            if (mTilesInProcess == 0 && mTilesNotYetStarted == 0) {
-                mWorkIsFinished.notify_one();
-            }
-        }
-    }
-    // if (threadIndex != 0) {
-    //     ALOGI("Ending thread%d", threadIndex);
-    // }
-}
-
-void TaskProcessor::doTask(Task* task) {
-    std::lock_guard<std::mutex> lockGuard(mTaskMutex);
-    task->setUsesSimd(mUsesSimd);
-    mCurrentTask = task;
-    // Notify the thread pool of available work.
-    startWork(task);
-    // Start processing some of the tiles on the calling thread.
-    processTilesOfWork(0, true);
-    // Wait for all the pool workers to complete.
-    waitForPoolWorkersToComplete();
-    mCurrentTask = nullptr;
-}
-
-void TaskProcessor::startWork(Task* task) {
-    /**
-     * The size in bytes that we're hoping each tile will be. If this value is too small,
-     * we'll spend too much time in synchronization. If it's too large, some cores may be
-     * idle while others still have a lot of work to do. Ideally, it would depend on the
-     * device we're running. 16k is the same value used by RenderScript and seems reasonable
-     * from ad-hoc tests.
-     */
-    const size_t targetTileSize = 16 * 1024;
-
-    std::lock_guard<std::mutex> lock(mQueueMutex);
-    assert(mTilesInProcess == 0);
-    mTilesNotYetStarted = task->setTiling(targetTileSize);
-    mWorkAvailableOrStop.notify_all();
-}
-
-void TaskProcessor::waitForPoolWorkersToComplete() {
-    std::unique_lock<std::mutex> lock(mQueueMutex);
-    // The predicate, i.e. the lambda, will make sure that
-    // we terminate even if the main thread calls this after
-    // mWorkIsFinished is signaled.
-    mWorkIsFinished.wait(lock, [this]() REQUIRES(mQueueMutex) {
-        return mTilesNotYetStarted == 0 && mTilesInProcess == 0;
-    });
-}
-
-}  // namespace renderscript
-}  // namespace android

diff --git a/toolkit/TaskProcessor.h b/toolkit/TaskProcessor.h
deleted file mode 100644
index 4d274fa..0000000
--- a/toolkit/TaskProcessor.h
+++ /dev/null

@@ -1,264 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ANDROID_RENDERSCRIPT_TOOLKIT_TASKPROCESSOR_H
-#define ANDROID_RENDERSCRIPT_TOOLKIT_TASKPROCESSOR_H
-
-#include <android-base/thread_annotations.h>
-
-#include <atomic>
-#include <condition_variable>
-#include <cstddef>
-#include <mutex>
-#include <thread>
-#include <vector>
-
-namespace android {
-namespace renderscript {
-
-/**
- * Description of the data to be processed for one Toolkit method call, e.g. one blur or one
- * blend operation.
- *
- * The data to be processed is a 2D array of cells. Each cell is a vector of 1 to 4 unsigned bytes.
- * The most typical configuration is a 2D array of uchar4 used to represent RGBA images.
- *
- * This is a base class. There will be a subclass for each Toolkit op.
- *
- * Typical usage of a derived class would look like:
- *    BlurTask task(in, out, sizeX, sizeY, vectorSize, etc);
- *    processor->doTask(&task);
- *
- * The TaskProcessor should call setTiling() and setUsesSimd() once, before calling processTile().
- * Other classes should not call setTiling(), setUsesSimd(), and processTile().
- */
-class Task {
-   protected:
-    /**
-     * Number of cells in the X direction.
-     */
-    const size_t mSizeX;
-    /**
-     * Number of cells in the Y direction.
-     */
-    const size_t mSizeY;
-    /**
-     * Number of elements in a vector (cell). From 1-4.
-     */
-    const size_t mVectorSize;
-    /**
-     * Whether the task prefers the processData call to represent the work to be done as
-     * one line rather than a rectangle. This would be the case for work that don't involve
-     * vertical neighbors, e.g. blend or histogram. A task would prefer this to minimize the
-     * number of SIMD calls to make, i.e. have one call that covers all the rows.
-     *
-     * This setting will be used only when a tile covers the entire width of the data to be
-     * processed.
-     */
-    const bool mPrefersDataAsOneRow;
-    /**
-     * Whether the processor we're working on supports SIMD operations.
-     */
-    bool mUsesSimd = false;
-
-   private:
-    /**
-     * If not null, we'll process a subset of the whole 2D array. This specifies the restriction.
-     */
-    const struct Restriction* mRestriction;
-
-    /**
-     * We'll divide the work into rectangular tiles. See setTiling().
-     */
-
-    /**
-     * Size of a tile in the X direction, as a number of cells.
-     */
-    size_t mCellsPerTileX = 0;
-    /**
-     * Size of a tile in the Y direction, as a number of cells.
-     */
-    size_t mCellsPerTileY = 0;
-    /**
-     * Number of tiles per row of the restricted area we're working on.
-     */
-    size_t mTilesPerRow = 0;
-    /**
-     * Number of tiles per column of the restricted area we're working on.
-     */
-    size_t mTilesPerColumn = 0;
-
-   public:
-    /**
-     * Construct a task.
-     *
-     * sizeX and sizeY should be greater than 0. vectorSize should be between 1 and 4.
-     * The restriction should outlive this instance. The Toolkit validates the
-     * arguments so we won't do that again here.
-     */
-    Task(size_t sizeX, size_t sizeY, size_t vectorSize, bool prefersDataAsOneRow,
-         const Restriction* restriction)
-        : mSizeX{sizeX},
-          mSizeY{sizeY},
-          mVectorSize{vectorSize},
-          mPrefersDataAsOneRow{prefersDataAsOneRow},
-          mRestriction{restriction} {}
-    virtual ~Task() {}
-
-    void setUsesSimd(bool uses) { mUsesSimd = uses; }
-
-    /**
-     * Divide the work into a number of tiles that can be distributed to the various threads.
-     * A tile will be a rectangular region. To be robust, we'll want to handle regular cases
-     * like 400x300 but also unusual ones like 1x120000, 120000x1, 1x1.
-     *
-     * We have a target size for the tiles, which corresponds roughly to how much data a thread
-     * will want to process before checking for more work. If the target is set too low, we'll spend
-     * more time in synchronization. If it's too large, some cores may not be used as efficiently.
-     *
-     * This method returns the number of tiles.
-     *
-     * @param targetTileSizeInBytes Target size. Values less than 1000 will be treated as 1000.
-     */
-    int setTiling(unsigned int targetTileSizeInBytes);
-
-    /**
-     * This is called by the TaskProcessor to instruct the task to process a tile.
-     *
-     * @param threadIndex The index of the thread that's processing the tile.
-     * @param tileIndex The index of the tile to process.
-     */
-    void processTile(unsigned int threadIndex, size_t tileIndex);
-
-   private:
-    /**
-     * Call to the derived class to process the data bounded by the rectangle specified
-     * by (startX, startY) and (endX, endY). The end values are EXCLUDED. This rectangle
-     * will be contained with the restriction, if one is provided.
-     */
-    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
-                             size_t endY) = 0;
-};
-
-/**
- * There's one instance of the task processor for the Toolkit. This class owns the thread pool,
- * and dispatches the tiles of work to the threads.
- */
-class TaskProcessor {
-    /**
-     * Does this processor support SIMD-like instructions?
-     */
-    const bool mUsesSimd;
-    /**
-     * The number of separate threads we'll spawn. It's one less than the number of threads that
-     * do the work as the client thread that starts the work will also be used.
-     */
-    const unsigned int mNumberOfPoolThreads;
-    /**
-     * Ensures that only one task is done at a time.
-     */
-    std::mutex mTaskMutex;
-    /**
-     * Ensures consistent access to the shared queue state.
-     */
-    std::mutex mQueueMutex;
-    /**
-     * The thread pool workers.
-     */
-    std::vector<std::thread> mPoolThreads;
-    /**
-     * The task being processed, if any. We only do one task at a time. We could create a queue
-     * of tasks but using a mTaskMutex is sufficient for now.
-     */
-    Task* mCurrentTask GUARDED_BY(mTaskMutex) = nullptr;
-    /**
-     * Signals that the mPoolThreads should terminate.
-     */
-    bool mStopThreads GUARDED_BY(mQueueMutex) = false;
-    /**
-     * Signaled when work is available or the mPoolThreads need to shut down. mStopThreads is used
-     * to distinguish between the two.
-     */
-    std::condition_variable mWorkAvailableOrStop;
-    /**
-     * Signaled when the work for the task is finished.
-     */
-    std::condition_variable mWorkIsFinished;
-    /**
-     * A user task, e.g. a blend or a blur, is split into a number of tiles. When a thread starts
-     * working on a new tile, it uses this count to identify which tile to work on. The tile
-     * number is sufficient to determine the boundaries of the data to process.
-     *
-     * The number of tiles left to process.
-     */
-    int mTilesNotYetStarted GUARDED_BY(mQueueMutex) = 0;
-    /**
-     * The number of tiles currently being processed. Must not be greater than
-     * mNumberOfPoolThreads + 1.
-     */
-    int mTilesInProcess GUARDED_BY(mQueueMutex) = 0;
-
-    /**
-     * Determines how we'll tile the work and signals the thread pool of available work.
-     *
-     * @param task The task to be performed.
-     */
-    void startWork(Task* task) REQUIRES(mTaskMutex);
-
-    /**
-     * Tells the thread to start processing work off the queue.
-     *
-     * The flag is used for prevent the main thread from blocking forever if the work is
-     * so trivial that the worker threads complete the work before the main thread calls this
-     * method.
-     *
-     * @param threadIndex The index number (0..mNumberOfPoolThreads) this thread will referred by.
-     * @param returnWhenNoWork If there's no work, return immediately.
-     */
-    void processTilesOfWork(int threadIndex, bool returnWhenNoWork);
-
-    /**
-     * Wait for the pool workers to complete the work on the current task.
-     */
-    void waitForPoolWorkersToComplete();
-
-   public:
-    /**
-     * Create the processor.
-     *
-     * @param numThreads The total number of threads to use. If 0, we'll decided based on system
-     * properties.
-     */
-    explicit TaskProcessor(unsigned int numThreads = 0);
-
-    ~TaskProcessor();
-
-    /**
-     * Do the specified task. Returns only after the task has been completed.
-     */
-    void doTask(Task* task);
-
-    /**
-     * Some Tasks need to allocate temporary storage for each worker thread.
-     * This provides the number of threads.
-     */
-    unsigned int getNumberOfThreads() const { return mNumberOfPoolThreads + 1; }
-};
-
-}  // namespace renderscript
-}  // namespace android
-
-#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_TASKPROCESSOR_H

diff --git a/toolkit/TestTaskProcessor.cpp b/toolkit/TestTaskProcessor.cpp
deleted file mode 100644
index 36a94f4..0000000
--- a/toolkit/TestTaskProcessor.cpp
+++ /dev/null

@@ -1,105 +0,0 @@
-#include <array>
-
-#include "TaskProcessor.h"
-
-/**
- * Sets all entries of the buffer to a value that depends on its coordinate and a delta.
- */
-class SimpleTask : public android::renderscript::Task {
-    uint8_t* mBuffer;
-    uint8_t mDelta;
-    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
-                             size_t endY);
-
-   public:
-    SimpleTask(uint8_t* buffer, size_t vectorSize, size_t sizeX, size_t sizeY, uint8_t delta)
-        : Task{sizeX, sizeY, vectorSize, false, nullptr}, mBuffer{buffer}, mDelta{delta} {}
-};
-
-/**
- * Create a new value that's a function of the x, y coordinates and a delta.
- */
-static uint8_t newValue(size_t x, size_t y, uint8_t delta) {
-    return (((x & 0xff) << 4) | (y & 0xff)) + delta;
-}
-
-void SimpleTask::processData(int /*threadIndex*/, size_t startX, size_t startY, size_t endX,
-                             size_t endY) {
-    for (size_t y = startY; y < endY; y++) {
-        for (size_t x = startX; x < endX; x++) {
-            size_t index = (y * mSizeX + x) * mVectorSize;
-            for (size_t i = 0; i < mVectorSize; i++) {
-                // Use add to make sure the opertion is only done once. This assumes
-                // the buffer starts set at 0.
-                mBuffer[index + i] += newValue(x, y, mDelta + i);
-            }
-        }
-    }
-}
-
-/**
- * Returns true if all the entries of the vector are the expected value.
- * Prints an error if not.
- */
-bool verifyAllTheSame(const std::vector<uint8_t>& buffer, size_t vectorSize, size_t sizeX,
-                      size_t sizeY, uint8_t delta) {
-    for (size_t y = 0; y < sizeY; y++) {
-        for (size_t x = 0; x < sizeX; x++) {
-            size_t index = (y * sizeX + x) * vectorSize;
-            for (size_t i = 0; i < vectorSize; i++) {
-                uint8_t expectedValue = newValue(x, y, delta + i);
-                if (buffer[index + i] != expectedValue) {
-                    printf("Test Error at %zu, %zu. Expected %u found %u instead\n", x, y,
-                           expectedValue, buffer[index + i]);
-                    return false;
-                }
-            }
-        }
-    }
-    return true;
-}
-
-/**
- * Create a buffer of the specified size, set each entry of that buffer
- * to the specified value using TaskProcessor, and verify the results.
- */
-void testOne(android::renderscript::TaskProcessor* processor, uint8_t delta, size_t vectorSize,
-             size_t sizeX, size_t sizeY) {
-    std::vector<uint8_t> buffer(sizeX * sizeY * vectorSize);
-
-    SimpleTask task{buffer.data(), vectorSize, sizeX, sizeY, delta};
-    processor->doTask(&task);
-
-    if (verifyAllTheSame(buffer, vectorSize, sizeX, sizeY, delta)) {
-        printf("Test %u: All good!\n", delta);
-    }
-}
-
-int main() {
-    std::vector<std::thread> testThreads;
-
-    // Test with multiple threads, to help find synchronization errors.
-    android::renderscript::TaskProcessor processorA(1);
-    android::renderscript::TaskProcessor processorB(4);
-    testThreads.emplace_back(testOne, &processorA, 1, 4, 30, 40);
-    testThreads.emplace_back(testOne, &processorB, 1, 4, 30, 40);
-    testThreads.emplace_back(testOne, &processorA, 2, 4, 800, 600);
-    testThreads.emplace_back(testOne, &processorB, 2, 4, 800, 600);
-    testThreads.emplace_back(testOne, &processorA, 3, 1, 123, 47);
-    testThreads.emplace_back(testOne, &processorB, 3, 1, 123, 47);
-    testThreads.emplace_back(testOne, &processorA, 5, 2, 5000, 8000);
-    testThreads.emplace_back(testOne, &processorB, 5, 2, 5000, 8000);
-    testThreads.emplace_back(testOne, &processorA, 6, 3, 26000, 1);
-    testThreads.emplace_back(testOne, &processorB, 6, 3, 26000, 1);
-    testThreads.emplace_back(testOne, &processorA, 7, 4, 1, 26000);
-    testThreads.emplace_back(testOne, &processorB, 7, 4, 1, 26000);
-    testThreads.emplace_back(testOne, &processorA, 8, 4, 1000, 1000);
-    testThreads.emplace_back(testOne, &processorB, 8, 4, 1000, 1000);
-    testThreads.emplace_back(testOne, &processorA, 9, 1, 1, 1);
-    testThreads.emplace_back(testOne, &processorB, 9, 1, 1, 1);
-
-    for (auto& thread : testThreads) {
-        thread.join();
-    }
-    return 0;
-}

diff --git a/toolkit/Utils.cpp b/toolkit/Utils.cpp
deleted file mode 100644
index 8ec9fbe..0000000
--- a/toolkit/Utils.cpp
+++ /dev/null

@@ -1,81 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "Utils.h"
-
-#include <cpu-features.h>
-
-#include "RenderScriptToolkit.h"
-
-namespace android {
-namespace renderscript {
-
-#define LOG_TAG "renderscript.toolkit.Utils"
-
-bool cpuSupportsSimd() {
-    AndroidCpuFamily family = android_getCpuFamily();
-    uint64_t features = android_getCpuFeatures();
-
-    if (family == ANDROID_CPU_FAMILY_ARM && (features & ANDROID_CPU_ARM_FEATURE_NEON)) {
-        // ALOGI("Arm with Neon");
-        return true;
-    } else if (family == ANDROID_CPU_FAMILY_ARM64 && (features & ANDROID_CPU_ARM64_FEATURE_ASIMD)) {
-        // ALOGI("Arm64 with ASIMD");
-        return true;
-    } else if ((family == ANDROID_CPU_FAMILY_X86 || family == ANDROID_CPU_FAMILY_X86_64) &&
-               (features & ANDROID_CPU_X86_FEATURE_SSSE3)) {
-        // ALOGI("x86* with SSE3");
-        return true;
-    }
-    // ALOGI("Not simd");
-    return false;
-}
-
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
-bool validRestriction(const char* tag, size_t sizeX, size_t sizeY, const Restriction* restriction) {
-    if (restriction == nullptr) {
-        return true;
-    }
-    if (restriction->startX >= sizeX || restriction->endX > sizeX) {
-        ALOGE("%s. sizeX should be greater than restriction->startX and greater or equal to "
-              "restriction->endX. %zu, %zu, and %zu were provided respectively.",
-              tag, sizeX, restriction->startX, restriction->endY);
-        return false;
-    }
-    if (restriction->startY >= sizeY && restriction->endY > sizeY) {
-        ALOGE("%s. sizeY should be greater than restriction->startY and greater or equal to "
-              "restriction->endY. %zu, %zu, and %zu were provided respectively.",
-              tag, sizeY, restriction->startY, restriction->endY);
-        return false;
-    }
-    if (restriction->startX >= restriction->endX) {
-        ALOGE("%s. Restriction startX should be less than endX. "
-              "%zu and %zu were provided respectively.",
-              tag, restriction->startX, restriction->endX);
-        return false;
-    }
-    if (restriction->startY >= restriction->endY) {
-        ALOGE("%s. Restriction startY should be less than endY. "
-              "%zu and %zu were provided respectively.",
-              tag, restriction->startY, restriction->endY);
-        return false;
-    }
-    return true;
-}
-#endif
-
-}  // namespace renderscript
-}  // namespace android

diff --git a/toolkit/Utils.h b/toolkit/Utils.h
deleted file mode 100644
index ff9eb43..0000000
--- a/toolkit/Utils.h
+++ /dev/null

@@ -1,155 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ANDROID_RENDERSCRIPT_TOOLKIT_UTILS_H
-#define ANDROID_RENDERSCRIPT_TOOLKIT_UTILS_H
-
-#include <android/log.h>
-
-namespace android {
-namespace renderscript {
-
-/* The Toolkit does not support floating point buffers but the original RenderScript Intrinsics
- * did for some operations. That code was preserved and protected by
- * ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT.
- */
-// TODO: On final packaging, decide whether this should be define in the build file, and for which
-// config. #define ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
-
-/* If we release the Toolkit as a C++ API, we'll want to enable validation at the C++ level
- * by uncommenting this define.
- *
- * If we only have a Java/Kotlin API, the Kotlin layer does validation. We don't need to duplicate
- * this effort.
- */
-#define ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
-
-#define ALOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
-#define ALOGW(...) __android_log_print(ANDROID_LOG_WARN, LOG_TAG, __VA_ARGS__)
-#define ALOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__)
-
-using uchar = unsigned char;
-using uint = unsigned int;
-using ushort = unsigned short;
-
-using uint8_t = uchar;
-using uint16_t = ushort;
-using uint32_t = uint;
-
-typedef float float2 __attribute__((ext_vector_type(2)));
-typedef float float3 __attribute__((ext_vector_type(3)));
-typedef float float4 __attribute__((ext_vector_type(4)));
-typedef uchar uchar2 __attribute__((ext_vector_type(2)));
-typedef uchar uchar3 __attribute__((ext_vector_type(3)));
-typedef uchar uchar4 __attribute__((ext_vector_type(4)));
-typedef ushort ushort2 __attribute__((ext_vector_type(2)));
-typedef ushort ushort3 __attribute__((ext_vector_type(3)));
-typedef ushort ushort4 __attribute__((ext_vector_type(4)));
-typedef uint uint2 __attribute__((ext_vector_type(2)));
-typedef uint uint3 __attribute__((ext_vector_type(3)));
-typedef uint uint4 __attribute__((ext_vector_type(4)));
-typedef short short2 __attribute__((ext_vector_type(2)));
-typedef short short3 __attribute__((ext_vector_type(3)));
-typedef short short4 __attribute__((ext_vector_type(4)));
-typedef int int2 __attribute__((ext_vector_type(2)));
-typedef int int3 __attribute__((ext_vector_type(3)));
-typedef int int4 __attribute__((ext_vector_type(4)));
-
-template <typename TO, typename TI>
-inline TO convert(TI i) {
-    // assert(i.x >= 0 && i.y >= 0 && i.z >= 0 && i.w >= 0);
-    // assert(i.x <= 255 && i.y <= 255 && i.z <= 255 && i.w <= 255);
-    return __builtin_convertvector(i, TO);
-}
-
-template <>
-inline uchar convert(float i) {
-    // assert(i.x >= 0 && i.y >= 0 && i.z >= 0 && i.w >= 0);
-    // assert(i.x <= 255 && i.y <= 255 && i.z <= 255 && i.w <= 255);
-    return (uchar)i;
-}
-
-template <>
-inline float convert(uchar i) {
-    // assert(i.x >= 0 && i.y >= 0 && i.z >= 0 && i.w >= 0);
-    // assert(i.x <= 255 && i.y <= 255 && i.z <= 255 && i.w <= 255);
-    return (float)i;
-}
-
-inline int4 clamp(int4 amount, int low, int high) {
-    int4 r;
-    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
-    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
-    r.z = amount.z < low ? low : (amount.z > high ? high : amount.z);
-    r.w = amount.w < low ? low : (amount.w > high ? high : amount.w);
-    return r;
-}
-
-inline float4 clamp(float4 amount, float low, float high) {
-    float4 r;
-    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
-    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
-    r.z = amount.z < low ? low : (amount.z > high ? high : amount.z);
-    r.w = amount.w < low ? low : (amount.w > high ? high : amount.w);
-    return r;
-}
-
-inline int2 clamp(int2 amount, int low, int high) {
-    int2 r;
-    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
-    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
-    return r;
-}
-
-inline float2 clamp(float2 amount, float low, float high) {
-    float2 r;
-    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
-    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
-    return r;
-}
-
-inline int clamp(int amount, int low, int high) {
-    return amount < low ? low : (amount > high ? high : amount);
-}
-
-inline float clamp(float amount, float low, float high) {
-    return amount < low ? low : (amount > high ? high : amount);
-}
-
-#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
-struct Restriction;
-
-bool validRestriction(const char* tag, size_t sizeX, size_t sizeY, const Restriction* restriction);
-#endif
-
-/**
- * Returns true if the processor we're running on supports the SIMD instructions that are
- * used in our assembly code.
- */
-bool cpuSupportsSimd();
-
-inline size_t divideRoundingUp(size_t a, size_t b) {
-    return a / b + (a % b == 0 ? 0 : 1);
-}
-
-inline size_t paddedSize(size_t size) {
-    return size == 3 ? 4 : size;
-}
-
-}  // namespace renderscript
-}  // namespace android
-
-#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_UTILS_H

diff --git a/toolkit/YuvToRgb.cpp b/toolkit/YuvToRgb.cpp
deleted file mode 100644
index 2da0f5c..0000000
--- a/toolkit/YuvToRgb.cpp
+++ /dev/null

@@ -1,241 +0,0 @@
-/*
- * Copyright (C) 2013 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-
-#include "RenderScriptToolkit.h"
-#include "TaskProcessor.h"
-#include "Utils.h"
-
-#define LOG_TAG "renderscript.toolkit.YuvToRgb"
-
-namespace android {
-namespace renderscript {
-
-inline size_t roundUpTo16(size_t val) {
-    return (val + 15) & ~15;
-}
-
-class YuvToRgbTask : public Task {
-    uchar4* mOut;
-    size_t mCstep;
-    size_t mStrideY;
-    size_t mStrideU;
-    size_t mStrideV;
-    const uchar* mInY;
-    const uchar* mInU;
-    const uchar* mInV;
-
-    void kernel(uchar4* out, uint32_t xstart, uint32_t xend, uint32_t currentY);
-    // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
-    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
-                             size_t endY) override;
-
-   public:
-    YuvToRgbTask(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY,
-                 RenderScriptToolkit::YuvFormat format)
-        : Task{sizeX, sizeY, 4, false, nullptr}, mOut{reinterpret_cast<uchar4*>(output)} {
-        switch (format) {
-            case RenderScriptToolkit::YuvFormat::NV21:
-                mCstep = 2;
-                mStrideY = sizeX;
-                mStrideU = mStrideY;
-                mStrideV = mStrideY;
-                mInY = reinterpret_cast<const uchar*>(input);
-                mInV = reinterpret_cast<const uchar*>(input + mStrideY * sizeY);
-                mInU = mInV + 1;
-                break;
-            case RenderScriptToolkit::YuvFormat::YV12:
-                mCstep = 1;
-                mStrideY = roundUpTo16(sizeX);
-                mStrideU = roundUpTo16(mStrideY >> 1);
-                mStrideV = mStrideU;
-                mInY = reinterpret_cast<const uchar*>(input);
-                mInU = reinterpret_cast<const uchar*>(input + mStrideY * sizeY);
-                mInV = mInU + mStrideV * sizeY / 2;
-                break;
-        }
-    }
-};
-
-void YuvToRgbTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
-                               size_t endY) {
-    for (size_t y = startY; y < endY; y++) {
-        size_t offset = mSizeX * y + startX;
-        uchar4* out = mOut + offset;
-        kernel(out, startX, endX, y);
-    }
-}
-
-static uchar4 rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v) {
-    int16_t Y = ((int16_t)y) - 16;
-    int16_t U = ((int16_t)u) - 128;
-    int16_t V = ((int16_t)v) - 128;
-
-    short4 p;
-    p.x = (Y * 298 + V * 409 + 128) >> 8;
-    p.y = (Y * 298 - U * 100 - V * 208 + 128) >> 8;
-    p.z = (Y * 298 + U * 516 + 128) >> 8;
-    p.w = 255;
-    if(p.x < 0) {
-        p.x = 0;
-    }
-    if(p.x > 255) {
-        p.x = 255;
-    }
-    if(p.y < 0) {
-        p.y = 0;
-    }
-    if(p.y > 255) {
-        p.y = 255;
-    }
-    if(p.z < 0) {
-        p.z = 0;
-    }
-    if(p.z > 255) {
-        p.z = 255;
-    }
-
-    return (uchar4){static_cast<uchar>(p.x), static_cast<uchar>(p.y),
-                    static_cast<uchar>(p.z), static_cast<uchar>(p.w)};
-}
-
-extern "C" void rsdIntrinsicYuv_K(void *dst, const uchar *Y, const uchar *uv, uint32_t xstart,
-                                  size_t xend);
-extern "C" void rsdIntrinsicYuvR_K(void *dst, const uchar *Y, const uchar *uv, uint32_t xstart,
-                                   size_t xend);
-extern "C" void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v,
-                                   size_t xstart, size_t xend);
-
-void YuvToRgbTask::kernel(uchar4 *out, uint32_t xstart, uint32_t xend, uint32_t currentY) {
-    //ALOGI("kernel out %p, xstart=%u, xend=%u, currentY=%u", out, xstart, xend, currentY);
-
-    const uchar *y = mInY + (currentY * mStrideY);
-    const uchar *v = mInV + ((currentY >> 1) * mStrideV);
-    const uchar *u = mInU + ((currentY >> 1) * mStrideU);
-
-    //ALOGI("pinY %p, pinV %p, pinU %p", pinY, pinV, pinU);
-
-    uint32_t x1 = xstart;
-    uint32_t x2 = xend;
-
-    /*
-    ALOGE("pinY, %p, Y, %p, currentY, %d, strideY, %zu", pinY, y, currentY, mStrideY);
-    ALOGE("pinU, %p, U, %p, currentY, %d, strideU, %zu", pinU, u, currentY, mStrideU);
-    ALOGE("pinV, %p, V, %p, currentY, %d, strideV, %zu", pinV, v, currentY, mStrideV);
-    ALOGE("dimX, %d, dimY, %d", cp->alloc->mHal.drvState.lod[0].dimX,
-          cp->alloc->mHal.drvState.lod[0].dimY);
-    ALOGE("info->dim.x, %d, info->dim.y, %d", info->dim.x, info->dim.y);
-    uchar* pinY = (uchar*)mInY;
-    uchar* pinU = (uchar*)mInU;
-    uchar* pinV = (uchar*)mInV;
-    ALOGE("Y %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
-          "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
-          pinY, pinY[0], pinY[1], pinY[2], pinY[3], pinY[4], pinY[5], pinY[6], pinY[7], pinY[8],
-          pinY[9], pinY[10], pinY[11], pinY[12], pinY[13], pinY[14], pinY[15]);
-    ALOGE("Y %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
-          "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
-          pinY, pinY[16], pinY[17], pinY[18], pinY[19], pinY[20], pinY[21], pinY[22], pinY[23],
-          pinY[24], pinY[25], pinY[26], pinY[27], pinY[28], pinY[29], pinY[30], pinY[31]);
-    ALOGE("Y %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
-          "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
-          pinY, pinY[32], pinY[33], pinY[34], pinY[35], pinY[36], pinY[37], pinY[38], pinY[39],
-          pinY[40], pinY[41], pinY[42], pinY[43], pinY[44], pinY[45], pinY[46], pinY[47]);
-
-    ALOGE("U %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
-          "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
-          pinU, pinU[0], pinU[1], pinU[2], pinU[3], pinU[4], pinU[5], pinU[6], pinU[7], pinU[8],
-          pinU[9], pinU[10], pinU[11], pinU[12], pinU[13], pinU[14], pinU[15]);
-    ALOGE("U %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
-          "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
-          pinU, pinU[16], pinU[17], pinU[18], pinU[19], pinU[20], pinU[21], pinU[22], pinU[23],
-          pinU[24], pinU[25], pinU[26], pinU[27], pinU[28], pinU[29], pinU[30], pinU[31]);
-    ALOGE("U %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
-          "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
-          pinU, pinU[32], pinU[33], pinU[34], pinU[35], pinU[36], pinU[37], pinU[38], pinU[39],
-          pinU[40], pinU[41], pinU[42], pinU[43], pinU[44], pinU[45], pinU[46], pinU[47]);
-
-    ALOGE("V %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
-          "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
-          pinV, pinV[0], pinV[1], pinV[2], pinV[3], pinV[4], pinV[5], pinV[6], pinV[7], pinV[8],
-          pinV[9], pinV[10], pinV[11], pinV[12], pinV[13], pinV[14], pinV[15]);
-    ALOGE("V %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
-          "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
-          pinV, pinV[16], pinV[17], pinV[18], pinV[19], pinV[20], pinV[21], pinV[22], pinV[23],
-          pinV[24], pinV[25], pinV[26], pinV[27], pinV[28], pinV[29], pinV[30], pinV[31]);
-    ALOGE("V %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
-          "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
-          pinV, pinV[32], pinV[33], pinV[34], pinV[35], pinV[36], pinV[37], pinV[38], pinV[39],
-          pinV[40], pinV[41], pinV[42], pinV[43], pinV[44], pinV[45], pinV[46], pinV[47]);
-    */
-
-    /* If we start on an odd pixel then deal with it here and bump things along
-     * so that subsequent code can carry on with even-odd pairing assumptions.
-     */
-    if((x1 & 1) && (x2 > x1)) {
-        int cx = (x1 >> 1) * mCstep;
-        *out = rsYuvToRGBA_uchar4(y[x1], u[cx], v[cx]);
-        out++;
-        x1++;
-    }
-
-#if defined(ARCH_ARM_USE_INTRINSICS)
-    if((x2 > x1) && mUsesSimd) {
-        int32_t len = x2 - x1;
-        if (mCstep == 1) {
-            rsdIntrinsicYuv2_K(out, y, u, v, x1, x2);
-            x1 += len;
-            out += len;
-        } else if (mCstep == 2) {
-            // Check for proper interleave
-            intptr_t ipu = (intptr_t)u;
-            intptr_t ipv = (intptr_t)v;
-
-            if (ipu == (ipv + 1)) {
-                rsdIntrinsicYuv_K(out, y, v, x1, x2);
-                x1 += len;
-                out += len;
-            } else if (ipu == (ipv - 1)) {
-                rsdIntrinsicYuvR_K(out, y, u, x1, x2);
-                x1 += len;
-                out += len;
-            }
-        }
-    }
-#endif
-
-    if(x2 > x1) {
-       // ALOGE("y %i  %i  %i", currentY, x1, x2);
-        while(x1 < x2) {
-            int cx = (x1 >> 1) * mCstep;
-            *out = rsYuvToRGBA_uchar4(y[x1], u[cx], v[cx]);
-            out++;
-            x1++;
-            *out = rsYuvToRGBA_uchar4(y[x1], u[cx], v[cx]);
-            out++;
-            x1++;
-        }
-    }
-}
-
-void RenderScriptToolkit::yuvToRgb(const uint8_t* input, uint8_t* output, size_t sizeX,
-                                   size_t sizeY, YuvFormat format) {
-    YuvToRgbTask task(input, output, sizeX, sizeY, format);
-    processor->doTask(&task);
-}
-
-}  // namespace renderscript
-}  // namespace android

diff --git a/toolkit/YuvToRgb_advsimd.S b/toolkit/YuvToRgb_advsimd.S
deleted file mode 100644
index bb4b7ae..0000000
--- a/toolkit/YuvToRgb_advsimd.S
+++ /dev/null

@@ -1,377 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
-#define END(f) .size f, .-f;
-
-/* Perform the actual YuvToRGB conversion in a macro, from register to
- * register.  This macro will be called from within several different wrapper
- * variants for different data layouts.  Y data starts with the even and odd
- * bytes split into the low parts of v8 and v9 respectively.  U and V are in
- * v10 and v11.  Working constants are pre-loaded into v24-v31, and v3 and v7
- * are pre-loaded with a constant 0xff alpha channel.
- *
- * The complicated arithmetic is the result of refactoring the original
- * equations to avoid 16-bit overflow without losing any precision.
- */
-.macro yuvkern, regu=v10, regv=v11
-        /* v0   out R_lo / even R_lo accumulator
-         * v1   out G_lo / even G_lo accumulator
-         * v2   out B_lo / even B_lo accumulator
-         * v3   out A_lo / const 0xff*ff
-         * v4   out R_hi / even R_hi accumulator
-         * v5   out G_hi / even G_hi accumulator
-         * v6   out B_hi / even B_hi accumulator
-         * v7   out A_hi / const 0xff*ff
-         * v8   even Y   / G_lo luma tmp
-         * v9   odd Y    / G_lo luma tmp
-         * \regu in U
-         * \regv in V
-         * v12  R_lo luma tmp
-         * v13  B_lo luma tmp
-         * v14  R_hi luma tmp
-         * v15  B_hi luma tmp
-         * v16  odd R_lo accumulator
-         * v17  odd G_lo accumulator
-         * v18  odd B_lo accumulator
-         * v19  multiplier extra bits low
-         * v20  odd R_hi accumulator
-         * v21  odd G_hi accumulator
-         * v22  odd B_hi accumulator
-         * v23  multiplier extra bits high
-         * v24  constant 149
-         * v25  constant 50
-         * v26  constant 104
-         * v27  constant 204
-         * v28  constant 254
-         * v29  constant ((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
-         * v30  constant ((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
-         * v31  constant ((16 * 149 + (128 << 2) + 128 * 254) >> 1)
-         */
-
-        umull       v1.8h,  v8.8b,  v24.8b      // g0 = y0 * 149
-        umull       v17.8h, v9.8b,  v24.8b      // g1 = y1 * 149
-        umull2      v5.8h,  v8.16b, v24.16b     // g0_hi = y0_hi * 149
-        umull2      v21.8h, v9.16b, v24.16b     // g1_hi = y1_hi * 149
-
-        umull       v8.8h, \regu\().8b, v25.8b     // g2 = u * 50 + v * 104
-        umlal       v8.8h, \regv\().8b, v26.8b
-        umull2      v9.8h, \regu\().16b, v25.16b   // g2_hi = u_hi * 50 + v_hi * 104
-        umlal2      v9.8h, \regv\().16b, v26.16b
-
-        ushr        v19.16b, \regv\().16b, #1
-        uaddw       v0.8h,  v1.8h,  v19.8b      // r0 = g0 + (v >> 1)
-        uaddw       v16.8h, v17.8h, v19.8b      // r1 = g1 + (v >> 1)
-
-        uaddw2      v4.8h,  v5.8h,  v19.16b     // r0_hi = g0_hi + (v_hi >> 1)
-        uaddw2      v20.8h, v21.8h, v19.16b     // r1_hi = g1_hi + (v_hi >> 1)
-
-        ushll       v19.8h, \regu\().8b,  #2
-        ushll2      v23.8h, \regu\().16b, #2
-        add         v2.8h,  v1.8h,  v19.8h      // b0 = g0 + (u << 2)
-        add         v18.8h, v17.8h, v19.8h      // b1 = g1 + (u << 2)
-
-        add         v6.8h,  v5.8h,  v23.8h      // b0_hi = g0_hi + (u_hi << 2)
-        add         v22.8h, v21.8h, v23.8h      // b1_hi = g1_hi + (u_hi << 2)
-
-        umull       v12.8h, \regv\().8b, v27.8b    // r2 = v * 204
-        umull       v13.8h, \regu\().8b, v28.8b    // b2 = u * 254
-
-        umull2      v14.8h, \regv\().16b, v27.16b  // r2_hi = v_hi * 204
-        umull2      v15.8h, \regu\().16b, v28.16b  // b2_hi = u_hi * 254
-
-        uhadd       v0.8h,  v0.8h,  v12.8h      // r0 = (r0 + r2) >> 1
-        uhadd       v16.8h, v16.8h, v12.8h      // r1 = (r1 + r2) >> 1
-        uqadd       v1.8h,  v1.8h,  v30.8h      // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
-        uqadd       v17.8h, v17.8h, v30.8h      // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
-        uhadd       v2.8h,  v2.8h,  v13.8h      // b0 = (b0 + b2) >> 1
-        uhadd       v18.8h, v18.8h, v13.8h      // b1 = (b1 + b2) >> 1
-
-        uhadd       v4.8h,  v4.8h,  v14.8h      // r0_hi = (r0_hi + r2_hi) >> 1
-        uhadd       v20.8h, v20.8h, v14.8h      // r1_hi = (r1_hi + r2_hi) >> 1
-        uqadd       v5.8h,  v5.8h,  v30.8h      // g0_hi = satu16(g0_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
-        uqadd       v21.8h, v21.8h, v30.8h      // g1_hi = satu16(g1_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
-        uhadd       v6.8h,  v6.8h,  v15.8h      // b0_hi = (b0_hi + b2_hi) >> 1
-        uhadd       v22.8h, v22.8h, v15.8h      // b1_hi = (b1_hi + b2_hi) >> 1
-
-        uqsub       v0.8h,  v0.8h,  v29.8h      // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
-        uqsub       v16.8h, v16.8h, v29.8h      // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
-        uqsub       v1.8h,  v1.8h,  v8.8h       // g0 = satu16(g0 - g2)
-        uqsub       v17.8h, v17.8h, v8.8h       // g1 = satu16(g1 - g2)
-        uqsub       v2.8h,  v2.8h,  v31.8h      // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
-        uqsub       v18.8h, v18.8h, v31.8h      // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
-
-        uqsub       v4.8h,  v4.8h,  v29.8h      // r0_hi = satu16(r0_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
-        uqsub       v20.8h, v20.8h, v29.8h      // r1_hi = satu16(r1_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
-        uqsub       v5.8h,  v5.8h,  v9.8h       // g0_hi = satu16(g0_hi - g2_hi)
-        uqsub       v21.8h, v21.8h, v9.8h       // g1_hi = satu16(g1_hi - g2_hi)
-        uqsub       v6.8h,  v6.8h,  v31.8h      // b0_hi = satu16(b0_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
-        uqsub       v22.8h, v22.8h, v31.8h      // b1_hi = satu16(b1_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
-
-        uqrshrn     v0.8b,  v0.8h,  #6
-        uqrshrn     v16.8b, v16.8h, #6
-        uqrshrn     v1.8b,  v1.8h,  #7
-        uqrshrn     v17.8b, v17.8h, #7
-        uqrshrn     v2.8b,  v2.8h,  #6
-        uqrshrn     v18.8b, v18.8h, #6
-
-        uqrshrn     v4.8b,  v4.8h,  #6
-        uqrshrn     v20.8b, v20.8h, #6
-        uqrshrn     v5.8b,  v5.8h,  #7
-        uqrshrn     v21.8b, v21.8h, #7
-        uqrshrn     v6.8b,  v6.8h,  #6
-        uqrshrn     v22.8b, v22.8h, #6
-
-        zip1        v0.16b, v0.16b, v16.16b
-        zip1        v1.16b, v1.16b, v17.16b
-        zip1        v2.16b, v2.16b, v18.16b
-
-        zip1        v4.16b, v4.16b, v20.16b
-        zip1        v5.16b, v5.16b, v21.16b
-        zip1        v6.16b, v6.16b, v22.16b
-.endm
-
-/* Define the wrapper code which will load and store the data, iterate the
- * correct number of times, and safely handle the remainder at the end of the
- * loop.  Some sections of code are switched out depending on the data packing
- * being handled.
- */
-.macro wrap_line kernel, interleaved=0, swapuv=0
-        movi        v24.16b, #149
-        movi        v25.16b, #50
-        movi        v26.16b, #104
-        movi        v27.16b, #204
-        movi        v28.16b, #254
-        mov         w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
-        dup         v29.8h, w5
-        mov         w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
-        dup         v30.8h, w5
-        mov         w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
-        dup         v31.8h, w5
-
-        movi        v3.16b, #0xff
-        movi        v7.16b, #0xff
-
-        subs        x2, x2, #32
-        bhs         1f
-        b           2f
-
-        .align 4
-1:      ld2         {v8.16b,v9.16b}, [x1], #32
-  .if \interleaved
-        ld2         {v10.16b,v11.16b}, [x3], #32
-  .else
-        ld1         {v10.16b}, [x3], #16
-        ld1         {v11.16b}, [x4], #16
-  .endif
-
-  .if \swapuv
-        \kernel regu=v11, regv=v10
-  .else
-        \kernel
-  .endif
-
-        subs        x2, x2, #32
-
-        st4         {v0.16b - v3.16b}, [x0], #64
-        st4         {v4.16b - v7.16b}, [x0], #64
-
-        bhs         1b
-
-2:      adds        x2, x2, #32
-        beq         2f
-
-        /* To handle the tail portion of the data (something less than 32
-         * bytes) load small power-of-two chunks into working registers.  It
-         * doesn't matter where they end up in the register; the same process
-         * will store them back out using the same positions and the
-         * interaction between neighbouring pixels is constrained to odd
-         * boundaries where the load operations don't interfere.
-         */
-        movi        v8.8b, #0
-        movi        v9.8b, #0
-        movi        v10.8b, #0
-        movi        v11.8b, #0
-
-        tbz         x2, #4, 1f
-        ld1         {v9.16b}, [x1], #16
-  .if \interleaved
-        ld1         {v11.16b}, [x3], #16
-  .else
-        ld1         {v10.d}[1], [x3], #8
-        ld1         {v11.d}[1], [x4], #8
-  .endif
-1:      tbz         x2, #3, 1f
-        ld1         {v8.d}[1], [x1], #8
-  .if \interleaved
-        ld1         {v10.d}[1], [x3], #8
-  .else
-        ld1         {v10.s}[1], [x3], #4
-        ld1         {v11.s}[1], [x4], #4
-  .endif
-1:      tbz         x2, #2, 1f
-        ld1         {v8.s}[1], [x1], #4
-  .if \interleaved
-        ld1         {v10.s}[1], [x3], #4
-  .else
-        ld1         {v10.h}[1], [x3], #2
-        ld1         {v11.h}[1], [x4], #2
-  .endif
-1:      tbz         x2, #1, 1f
-        ld1         {v8.h}[1], [x1], #2
-  .if \interleaved
-        ld1         {v10.h}[1], [x3], #2
-  .else
-        ld1         {v10.b}[1], [x3], #1
-        ld1         {v11.b}[1], [x4], #1
-  .endif
-1:      tbz         x2, #0, 1f
-        ld1         {v8.b}[1], [x1], #1
-  .if \interleaved
-        ld1         {v10.h}[0], [x3], #2
-  .else
-        ld1         {v10.b}[0], [x3], #1
-        ld1         {v11.b}[0], [x4], #1
-  .endif
-
-        /* One small impediment in the process above is that some of the load
-         * operations can't perform byte-wise structure deinterleaving at the
-         * same time as loading only part of a register.  So the data is loaded
-         * linearly and unpacked manually at this point if necessary.
-         */
-1:      mov         v12.16b, v8.16b
-        uzp1        v8.16b, v12.16b, v9.16b
-        uzp2        v9.16b, v12.16b, v9.16b
-  .if \interleaved
-        mov         v12.16b, v10.16b
-        uzp1        v10.16b, v12.16b, v11.16b
-        uzp2        v11.16b, v12.16b, v11.16b
-  .endif
-
-  .if \swapuv
-        \kernel regu=v11, regv=v10
-  .else
-        \kernel
-  .endif
-
-        /* As above but with the output; structured stores for partial vectors
-         * aren't available, so the data is re-packed first and stored linearly.
-         */
-        zip1        v16.16b, v0.16b, v2.16b
-        zip2        v18.16b, v0.16b, v2.16b
-        zip1        v17.16b, v1.16b, v3.16b
-        zip2        v19.16b, v1.16b, v3.16b
-        zip1        v0.16b, v16.16b, v17.16b
-        zip2        v1.16b, v16.16b, v17.16b
-        zip1        v2.16b, v18.16b, v19.16b
-        zip2        v3.16b, v18.16b, v19.16b
-
-        /* Luckily v4-v7 don't need to be unzipped because the complete set of
-         * four and can be stored using st4. */
-
-        tbz         x2, #4, 1f
-        st4         {v4.16b - v7.16b}, [x0], #64
-1:      tbz         x2, #3, 1f
-        st1         {v2.16b,v3.16b}, [x0], #32
-1:      tbz         x2, #2, 1f
-        st1         {v1.16b}, [x0], #16
-1:      tbz         x2, #1, 1f
-        st1         {v0.d}[1], [x0], #8
-1:      tbz         x2, #0, 2f
-        st1         {v0.s}[1], [x0], #4
-2:
-.endm
-
-
-/*  void rsdIntrinsicYuv2_K(
- *          void *out,          // x0
- *          void const *yin,    // x1
- *          void const *uin,    // x2
- *          void const *vin,    // x3
- *          size_t xstart,      // x4
- *          size_t xend);       // x5
- */
-ENTRY(rsdIntrinsicYuv2_K)
-        lsr         x6, x4, #1
-        add         x0, x0, x4, LSL #2
-        add         x1, x1, x4
-        add         x4, x3, x6
-        add         x3, x2, x6
-        sub         x2, x5, x6, LSL #1
-
-        sub         x6, sp, #32
-        sub         sp, sp, #64
-        st1         {v8.1d - v11.1d}, [sp]
-        st1         {v12.1d - v15.1d}, [x6]
-
-        wrap_line yuvkern, 0
-
-        ld1         {v8.1d - v11.1d}, [sp], #32
-        ld1         {v12.1d - v15.1d}, [sp], #32
-        ret
-END(rsdIntrinsicYuv2_K)
-
-/*  void rsdIntrinsicYuv_K(
- *          void *out,          // x0
- *          void const *yin,    // x1
- *          void const *uvin,   // x2
- *          size_t xstart,      // x3
- *          size_t xend);       // x4
- */
-ENTRY(rsdIntrinsicYuv_K)
-        bic         x5, x3, #1
-        add         x0, x0, x5, LSL #2
-        add         x1, x1, x5
-        add         x3, x2, x5
-        sub         x2, x4, x5
-
-        sub         x5, sp, #32
-        sub         sp, sp, #64
-        st1         {v8.1d - v11.1d}, [sp]
-        st1         {v12.1d - v15.1d}, [x5]
-
-        wrap_line yuvkern, 1, 1
-
-        ld1         {v8.1d - v11.1d}, [sp], #32
-        ld1         {v12.1d - v15.1d}, [sp], #32
-        ret
-END(rsdIntrinsicYuv_K)
-
-/*  void rsdIntrinsicYuvR_K(
- *          void *out,          // x0
- *          void const *yin,    // x1
- *          void const *uvin,   // x2
- *          size_t xstart,      // x3
- *          size_t xend);       // x4
- */
-ENTRY(rsdIntrinsicYuvR_K)
-        bic         x5, x3, #1
-        add         x0, x0, x5, LSL #2
-        add         x1, x1, x5
-        add         x3, x2, x5
-        sub         x2, x4, x5
-
-        sub         x5, sp, #32
-        sub         sp, sp, #64
-        st1         {v8.1d - v11.1d}, [sp]
-        st1         {v12.1d - v15.1d}, [x5]
-
-        wrap_line yuvkern, 1
-
-        ld1         {v8.1d - v11.1d}, [sp], #32
-        ld1         {v12.1d - v15.1d}, [sp], #32
-        ret
-END(rsdIntrinsicYuvR_K)

diff --git a/toolkit/YuvToRgb_neon.S b/toolkit/YuvToRgb_neon.S
deleted file mode 100644
index 5c3bce4..0000000
--- a/toolkit/YuvToRgb_neon.S
+++ /dev/null

@@ -1,298 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
-#define END(f) .fnend; .size f, .-f;
-
-.eabi_attribute 25,1 @Tag_ABI_align8_preserved
-.arm
-
-/* Perform the actual YuvToRGB conversion in a macro, from register to
- * register.  This macro will be called from within several different wrapper
- * variants for different data layouts.  Y data starts in q8, but with the even
- * and odd bytes split into d16 and d17 respectively.  U and V are in d20
- * and d21.  Working constants are pre-loaded into q13-q15, and q3 is
- * pre-loaded with a constant 0xff alpha channel.
- *
- * The complicated arithmetic is the result of refactoring the original
- * equations to avoid 16-bit overflow without losing any precision.
- */
-.macro yuvkern
-        vmov.i8     d15, #149
-
-        vmull.u8    q1, d16, d15        // g0 = y0 * 149
-        vmull.u8    q5, d17, d15        // g1 = y1 * 149
-
-        vmov.i8     d14, #50
-        vmov.i8     d15, #104
-        vmull.u8    q8, d20, d14        // g2 = u * 50 + v * 104
-        vmlal.u8    q8, d21, d15
-
-        vshr.u8     d14, d21, #1
-        vaddw.u8    q0, q1, d14         // r0 = y0 * 149 + (v >> 1)
-        vaddw.u8    q4, q5, d14         // r1 = y1 * 149 + (v >> 1)
-
-        vshll.u8    q7, d20, #2
-        vadd.u16    q2, q1, q7          // b0 = y0 * 149 + (u << 2)
-        vadd.u16    q6, q5, q7          // b1 = y1 * 149 + (u << 2)
-
-        vmov.i8     d14, #204
-        vmov.i8     d15, #254
-        vmull.u8    q11, d21, d14       // r2 = v * 204
-        vmull.u8    q12, d20, d15       // b2 = u * 254
-
-        vhadd.u16   q0, q11             // r0 = (r0 + r2) >> 1
-        vhadd.u16   q4, q11             // r1 = (r1 + r2) >> 1
-        vqadd.u16   q1, q14             // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
-        vqadd.u16   q5, q14             // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
-        vhadd.u16   q2, q12             // b0 = (b0 + b2) >> 1
-        vhadd.u16   q6, q12             // b1 = (b1 + b2) >> 1
-
-        vqsub.u16   q0, q13             // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
-        vqsub.u16   q4, q13             // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
-        vqsub.u16   q1, q8              // g0 = satu16(g0 - g2)
-        vqsub.u16   q5, q8              // g1 = satu16(g1 - g2)
-        vqsub.u16   q2, q15             // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
-        vqsub.u16   q6, q15             // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
-
-        vqrshrn.u16 d0, q0, #6
-        vqrshrn.u16 d1, q1, #7
-        vqrshrn.u16 d2, q4, #6
-        vqrshrn.u16 d3, q5, #7
-        vqrshrn.u16 d4, q2, #6
-        vqrshrn.u16 d5, q6, #6
-
-        vzip.u8     q0, q1
-        vzip.u8     d4, d5
-.endm
-
-/* Define the wrapper code which will load and store the data, iterate the
- * correct number of times, and safely handle the remainder at the end of the
- * loop.  Some sections of code are switched out depending on the data packing
- * being handled.
- */
-.macro wrap_line kernel, interleaved=0, swapuv=0
-
-        movw        r5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
-        vdup.i16    q13, r5
-        movw        r5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
-        vdup.i16    q14, r5
-        movw        r5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
-        vdup.i16    q15, r5
-
-        vmov.i8     q3, #0xff
-
-        subs        r2, #16
-        bhs         1f
-        b           2f
-
-        .align 4
-1:      vld2.u8     {d16,d17}, [r1]!
-        pld         [r1, #256]
-  .if \interleaved
-        vld2.u8     {d20,d21}, [r3]!
-    .if \swapuv
-        vswp        d20, d21
-    .endif
-        pld         [r3, #256]
-  .else
-        vld1.u8     d20, [r3]!
-        vld1.u8     d21, [r4]!
-        pld         [r3, #128]
-        pld         [r4, #128]
-  .endif
-
-        \kernel
-
-        subs        r2, #16
-
-        vst4.u8     {d0,d2,d4,d6}, [r0]!
-        vst4.u8     {d1,d3,d5,d7}, [r0]!
-
-        bhs         1b
-
-2:      adds        r2, #16
-        beq         2f
-
-        /* To handle the tail portion of the data (something less than 16
-         * bytes) load small power-of-two chunks into working registers.  It
-         * doesn't matter where they end up in the register; the same process
-         * will store them back out using the same positions and the
-         * interaction between neighbouring pixels is constrained to odd
-         * boundaries where the load operations don't interfere.
-         */
-        vmov.i8     q8, #0
-        vmov.i8     q10, #0
-
-        tst         r2, #8
-        beq         1f
-        vld1.u8     d17, [r1]!
-  .if \interleaved
-        vld1.u8     d21, [r3]!
-  .else
-        vld1.u32    d20[1], [r3]!
-        vld1.u32    d21[1], [r4]!
-  .endif
-
-1:      tst         r2, #4
-        beq         1f
-        vld1.u32    d16[1], [r1]!
-  .if \interleaved
-        vld1.u32    d20[1], [r3]!
-  .else
-        vld1.u16    d20[1], [r3]!
-        vld1.u16    d21[1], [r4]!
-  .endif
-1:      tst         r2, #2
-        beq         1f
-        vld1.u16    d16[1], [r1]!
-  .if \interleaved
-        vld1.u16    d20[1], [r3]!
-  .else
-        vld1.u8     d20[1], [r3]!
-        vld1.u8     d21[1], [r4]!
-  .endif
-1:      tst         r2, #1
-        beq         1f
-        vld1.u8     d16[1], [r1]!
-  .if \interleaved
-        vld1.u16    d20[0], [r3]!
-  .else
-        vld1.u8     d20[0], [r3]!
-        vld1.u8     d21[0], [r4]!
-  .endif
-
-        /* One small impediment in the process above is that some of the load
-         * operations can't perform byte-wise structure deinterleaving at the
-         * same time as loading only part of a register.  So the data is loaded
-         * linearly and unpacked manually at this point if necessary.
-         */
-1:      vuzp.8      d16, d17
-  .if \interleaved
-        vuzp.8      d20, d21
-    .if \swapuv
-        vswp        d20, d21
-    .endif
-  .endif
-
-        \kernel
-
-        /* As above but with the output; structured stores for partial vectors
-         * aren't available, so the data is re-packed first and stored linearly.
-         */
-        vzip.8  q0, q2
-        vzip.8  q1, q3
-        vzip.8  q0, q1
-        vzip.8  q2, q3
-
-1:      tst         r2, #8
-        beq         1f
-        vst1.u8     {d4,d5,d6,d7}, [r0]!
-
-1:      tst         r2, #4
-        beq         1f
-        vst1.u8     {d2,d3}, [r0]!
-1:      tst         r2, #2
-        beq         1f
-        vst1.u8     d1, [r0]!
-1:      tst         r2, #1
-        beq         2f
-        vst1.u32    d0[1], [r0]!
-2:
-.endm
-
-
-/*  void rsdIntrinsicYuv2_K(
- *          void *out,          // r0
- *          void const *yin,    // r1
- *          void const *uin,    // r2
- *          void const *vin,    // r3
- *          size_t xstart,      // [sp]
- *          size_t xend);       // [sp+#4]
- */
-ENTRY(rsdIntrinsicYuv2_K)
-        push        {r4,r5}
-        ldr         r5, [sp, #8]
-        mov         r4, r3
-        mov         r3, r2
-        ldr         r2, [sp, #12]
-
-        add         r0, r5, LSL #2
-        add         r1, r5
-        add         r3, r5, LSR #1
-        add         r4, r5, LSR #1
-        sub         r2, r5
-
-        vpush       {d8-d15}
-
-        wrap_line yuvkern, 0
-
-        vpop        {d8-d15}
-        pop         {r4,r5}
-        bx lr
-END(rsdIntrinsicYuv2_K)
-
-/*  void rsdIntrinsicYuv_K(
- *          void *out,          // r0
- *          void const *yin,    // r1
- *          void const *uvin,   // r2
- *          size_t xstart,      // r3
- *          size_t xend);       // [sp]
- */
-ENTRY(rsdIntrinsicYuv_K)
-        push        {r4,r5}
-        bic         r4, r3, #1
-        add         r3, r2, r4
-        ldr         r2, [sp, #8]
-
-        add         r0, r4, LSL #2
-        add         r1, r4
-        sub         r2, r4
-
-        vpush       {d8-d15}
-
-        wrap_line yuvkern, 1, 1
-
-        vpop        {d8-d15}
-        pop         {r4,r5}
-        bx lr
-END(rsdIntrinsicYuv_K)
-
-/*  void rsdIntrinsicYuvR_K(
- *          void *out,          // r0
- *          void const *yin,    // r1
- *          void const *uvin,   // r2
- *          size_t xstart,      // r3
- *          size_t xend);       // [sp]
- */
-ENTRY(rsdIntrinsicYuvR_K)
-        push        {r4,r5}
-        bic         r4, r3, #1
-        add         r3, r2, r4
-        ldr         r2, [sp, #8]
-
-        add         r0, r4, LSL #2
-        add         r1, r4
-        sub         r2, r4
-
-        vpush       {d8-d15}
-
-        wrap_line yuvkern, 1
-
-        vpop        {d8-d15}
-        pop         {r4,r5}
-        bx lr
-END(rsdIntrinsicYuvR_K)

diff --git a/toolkit/java/Toolkit.kt b/toolkit/java/Toolkit.kt
deleted file mode 100644
index 438f241..0000000
--- a/toolkit/java/Toolkit.kt
+++ /dev/null

@@ -1,1566 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.google.android.renderscript
-
-import android.graphics.Bitmap
-import java.lang.IllegalArgumentException
-
-// This string is used for error messages.
-private const val externalName = "RenderScript Toolkit"
-
-/**
- * A collection of high-performance graphic utility functions like blur and blend.
- *
- * This toolkit provides ten image manipulation functions: blend, blur, color matrix, convolve,
- * histogram, histogramDot, lut, lut3d, resize, and YUV to RGB. These functions execute
- * multithreaded on the CPU.
- *
- * Most of the functions have two variants: one that manipulates Bitmaps, the other ByteArrays.
- * For ByteArrays, you need to specify the width and height of the data to be processed, as
- * well as the number of bytes per pixel. For most use cases, this will be 4.
- *
- * The Toolkit creates a thread pool that's used for processing the functions. The threads live
- * for the duration of the application. They can be destroyed by calling the method shutdown().
- *
- * This library is thread safe. You can call methods from different poolThreads. The functions will
- * execute sequentially.
- *
- * A native C++ version of this Toolkit is available. Check the RenderScriptToolkit.h file in the
- * cpp directory.
- *
- * This toolkit can be used as a replacement for most RenderScript Intrinsic functions. Compared
- * to RenderScript, it's simpler to use and more than twice as fast on the CPU. However RenderScript
- * Intrinsics allow more flexibility for the type of allocation supported. In particular, this
- * toolkit does not support allocations of floats.
- */
-object Toolkit {
-    /**
-     * Blends a source buffer with the destination buffer.
-     *
-     * Blends a source buffer and a destination buffer, placing the result in the destination
-     * buffer. The blending is done pairwise between two corresponding RGBA values found in
-     * each buffer. The mode parameter specifies one of fifteen supported blending operations.
-     * See {@link BlendingMode}.
-     *
-     * A variant of this method is also available to blend Bitmaps.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of each buffer. If provided, the range must be wholly contained with the dimensions
-     * described by sizeX and sizeY.
-     *
-     * The source and destination buffer must have the same dimensions. Both arrays should have
-     * a size greater or equal to sizeX * sizeY * 4. The buffers have a row-major layout.
-     *
-     * @param mode The specific blending operation to do.
-     * @param sourceArray The RGBA input buffer.
-     * @param destArray The destination buffer. Used for input and output.
-     * @param sizeX The width of both buffers, as a number of RGBA values.
-     * @param sizeY The height of both buffers, as a number of RGBA values.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     */
-    @JvmOverloads
-    fun blend(
-        mode: BlendingMode,
-        sourceArray: ByteArray,
-        destArray: ByteArray,
-        sizeX: Int,
-        sizeY: Int,
-        restriction: Range2d? = null
-    ) {
-        require(sourceArray.size >= sizeX * sizeY * 4) {
-            "$externalName blend. sourceArray is too small for the given dimensions. " +
-                    "$sizeX*$sizeY*4 < ${sourceArray.size}."
-        }
-        require(destArray.size >= sizeX * sizeY * 4) {
-            "$externalName blend. sourceArray is too small for the given dimensions. " +
-                    "$sizeX*$sizeY*4 < ${sourceArray.size}."
-        }
-        validateRestriction("blend", sizeX, sizeY, restriction)
-
-        nativeBlend(nativeHandle, mode.value, sourceArray, destArray, sizeX, sizeY, restriction)
-    }
-
-    /**
-     * Blends a source bitmap with the destination bitmap.
-     *
-     * Blends a source bitmap and a destination bitmap, placing the result in the destination
-     * bitmap. The blending is done pairwise between two corresponding RGBA values found in
-     * each bitmap. The mode parameter specify one of fifteen supported blending operations.
-     * See {@link BlendingMode}.
-     *
-     * A variant of this method is available to blend ByteArrays.
-     *
-     * The bitmaps should have identical width and height, and have a config of ARGB_8888.
-     * Bitmaps with a stride different than width * vectorSize are not currently supported.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of each bitmap. If provided, the range must be wholly contained with the dimensions
-     * of the bitmap.
-     *
-     * @param mode The specific blending operation to do.
-     * @param sourceBitmap The RGBA input buffer.
-     * @param destBitmap The destination buffer. Used for input and output.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     */
-    @JvmOverloads
-    fun blend(
-        mode: BlendingMode,
-        sourceBitmap: Bitmap,
-        destBitmap: Bitmap,
-        restriction: Range2d? = null
-    ) {
-        validateBitmap("blend", sourceBitmap)
-        validateBitmap("blend", destBitmap)
-        require(
-            sourceBitmap.width == destBitmap.width &&
-                    sourceBitmap.height == destBitmap.height
-        ) {
-            "$externalName blend. Source and destination bitmaps should be the same size. " +
-                    "${sourceBitmap.width}x${sourceBitmap.height} and " +
-                    "${destBitmap.width}x${destBitmap.height} provided."
-        }
-        require(sourceBitmap.config == destBitmap.config) {
-            "RenderScript Toolkit blend. Source and destination bitmaps should have the same " +
-                    "config. ${sourceBitmap.config} and ${destBitmap.config} provided."
-        }
-        validateRestriction("blend", sourceBitmap.width, sourceBitmap.height, restriction)
-
-        nativeBlendBitmap(nativeHandle, mode.value, sourceBitmap, destBitmap, restriction)
-    }
-
-    /**
-     * Blurs an image.
-     *
-     * Performs a Gaussian blur of an image and returns result in a ByteArray buffer. A variant of
-     * this method is available to blur Bitmaps.
-     *
-     * The radius determines which pixels are used to compute each blurred pixels. This Toolkit
-     * accepts values between 1 and 25. Larger values create a more blurred effect but also
-     * take longer to compute. When the radius extends past the edge, the edge pixel will
-     * be used as replacement for the pixel that's out off boundary.
-     *
-     * Each input pixel can either be represented by four bytes (RGBA format) or one byte
-     * for the less common blurring of alpha channel only image.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of each buffer. If provided, the range must be wholly contained with the dimensions
-     * described by sizeX and sizeY. NOTE: The output buffer will still be full size, with the
-     * section that's not blurred all set to 0. This is to stay compatible with RenderScript.
-     *
-     * The source buffer should be large enough for sizeX * sizeY * mVectorSize bytes. It has a
-     * row-major layout.
-     *
-     * @param inputArray The buffer of the image to be blurred.
-     * @param vectorSize Either 1 or 4, the number of bytes in each cell, i.e. A vs. RGBA.
-     * @param sizeX The width of both buffers, as a number of 1 or 4 byte cells.
-     * @param sizeY The height of both buffers, as a number of 1 or 4 byte cells.
-     * @param radius The radius of the pixels used to blur, a value from 1 to 25.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     * @return The blurred pixels, a ByteArray of size.
-     */
-    @JvmOverloads
-    fun blur(
-        inputArray: ByteArray,
-        vectorSize: Int,
-        sizeX: Int,
-        sizeY: Int,
-        radius: Int = 5,
-        restriction: Range2d? = null
-    ): ByteArray {
-        require(vectorSize == 1 || vectorSize == 4) {
-            "$externalName blur. The vectorSize should be 1 or 4. $vectorSize provided."
-        }
-        require(inputArray.size >= sizeX * sizeY * vectorSize) {
-            "$externalName blur. inputArray is too small for the given dimensions. " +
-                    "$sizeX*$sizeY*$vectorSize < ${inputArray.size}."
-        }
-        require(radius in 1..25) {
-            "$externalName blur. The radius should be between 1 and 25. $radius provided."
-        }
-        validateRestriction("blur", sizeX, sizeY, restriction)
-
-        val outputArray = ByteArray(inputArray.size)
-        nativeBlur(
-            nativeHandle, inputArray, vectorSize, sizeX, sizeY, radius, outputArray, restriction
-        )
-        return outputArray
-    }
-
-    /**
-     * Blurs an image.
-     *
-     * Performs a Gaussian blur of a Bitmap and returns result as a Bitmap. A variant of
-     * this method is available to blur ByteArrays.
-     *
-     * The radius determines which pixels are used to compute each blurred pixels. This Toolkit
-     * accepts values between 1 and 25. Larger values create a more blurred effect but also
-     * take longer to compute. When the radius extends past the edge, the edge pixel will
-     * be used as replacement for the pixel that's out off boundary.
-     *
-     * This method supports input Bitmap of config ARGB_8888 and ALPHA_8. Bitmaps with a stride
-     * different than width * vectorSize are not currently supported. The returned Bitmap has the
-     * same config.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of each buffer. If provided, the range must be wholly contained with the dimensions
-     * described by sizeX and sizeY. NOTE: The output Bitmap will still be full size, with the
-     * section that's not blurred all set to 0. This is to stay compatible with RenderScript.
-     *
-     * @param inputBitmap The buffer of the image to be blurred.
-     * @param radius The radius of the pixels used to blur, a value from 1 to 25. Default is 5.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     * @return The blurred Bitmap.
-     */
-    @JvmOverloads
-    fun blur(inputBitmap: Bitmap, radius: Int = 5, restriction: Range2d? = null): Bitmap {
-        validateBitmap("blur", inputBitmap)
-        require(radius in 1..25) {
-            "$externalName blur. The radius should be between 1 and 25. $radius provided."
-        }
-        validateRestriction("blur", inputBitmap.width, inputBitmap.height, restriction)
-
-        val outputBitmap = createCompatibleBitmap(inputBitmap)
-        nativeBlurBitmap(nativeHandle, inputBitmap, outputBitmap, radius, restriction)
-        return outputBitmap
-    }
-
-    /**
-     * Identity matrix that can be passed to the {@link RenderScriptToolkit::colorMatrix} method.
-     *
-     * Using this matrix will result in no change to the pixel through multiplication although
-     * the pixel value can still be modified by the add vector, or transformed to a different
-     * format.
-     */
-    val identityMatrix: FloatArray
-        get() = floatArrayOf(
-            1f, 0f, 0f, 0f,
-            0f, 1f, 0f, 0f,
-            0f, 0f, 1f, 0f,
-            0f, 0f, 0f, 1f
-        )
-
-    /**
-     * Matrix to turn color pixels to a grey scale.
-     *
-     * Use this matrix with the {@link RenderScriptToolkit::colorMatrix} method to convert an
-     * image from color to greyscale.
-     */
-    val greyScaleColorMatrix: FloatArray
-        get() = floatArrayOf(
-            0.299f, 0.299f, 0.299f, 0f,
-            0.587f, 0.587f, 0.587f, 0f,
-            0.114f, 0.114f, 0.114f, 0f,
-            0f, 0f, 0f, 1f
-        )
-
-    /**
-     * Matrix to convert RGB to YUV.
-     *
-     * Use this matrix with the {@link RenderScriptToolkit::colorMatrix} method to convert the
-     * first three bytes of each pixel from RGB to YUV. This leaves the last byte (the alpha
-     * channel) untouched.
-     *
-     * This is a simplistic conversion. Most YUV buffers have more complicated format, not supported
-     * by this method.
-     */
-    val rgbToYuvMatrix: FloatArray
-        get() = floatArrayOf(
-            0.299f, -0.14713f, 0.615f, 0f,
-            0.587f, -0.28886f, -0.51499f, 0f,
-            0.114f, 0.436f, -0.10001f, 0f,
-            0f, 0f, 0f, 1f
-        )
-
-    /**
-     * Matrix to convert YUV to RGB.
-     *
-     * Use this matrix with the {@link RenderScriptToolkit::colorMatrix} method to convert the
-     * first three bytes of each pixel from YUV to RGB. This leaves the last byte (the alpha
-     * channel) untouched.
-     *
-     * This is a simplistic conversion. Most YUV buffers have more complicated format, not supported
-     * by this method. Use {@link RenderScriptToolkit::yuvToRgb} to convert these buffers.
-     */
-    val yuvToRgbMatrix: FloatArray
-        get() = floatArrayOf(
-            1f, 1f, 1f, 0f,
-            0f, -0.39465f, 2.03211f, 0f,
-            1.13983f, -0.5806f, 0f, 0f,
-            0f, 0f, 0f, 1f
-        )
-
-    /**
-     * Transform an image using a color matrix.
-     *
-     * Converts a 2D array of vectors of unsigned bytes, multiplying each vectors by a 4x4 matrix
-     * and adding an optional vector.
-     *
-     * Each input vector is composed of 1-4 unsigned bytes. If less than 4 bytes, it's extended to
-     * 4, padding with zeroes. The unsigned bytes are converted from 0-255 to 0.0-1.0 floats
-     * before the multiplication is done.
-     *
-     * The resulting value is normalized from 0.0-1.0 to a 0-255 value and stored in the output.
-     * If the output vector size is less than four, the unused channels are discarded.
-     *
-     * If addVector is not specified, a vector of zeroes is added, i.e. a noop.
-     *
-     * Like the RenderScript Intrinsics, vectorSize of size 3 are padded to occupy 4 bytes.
-     *
-     * Check identityMatrix, greyScaleColorMatrix, rgbToYuvMatrix, and yuvToRgbMatrix for sample
-     * matrices. The YUV conversion may not work for all color spaces.
-     *
-     * @param inputArray The buffer of the image to be converted.
-     * @param inputVectorSize The number of bytes in each input cell, a value from 1 to 4.
-     * @param sizeX The width of both buffers, as a number of 1 to 4 byte cells.
-     * @param sizeY The height of both buffers, as a number of 1 to 4 byte cells.
-     * @param outputVectorSize The number of bytes in each output cell, a value from 1 to 4.
-     * @param matrix The 4x4 matrix to multiply, in row major format.
-     * @param addVector A vector of four floats that's added to the result of the multiplication.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     * @return The converted buffer.
-     */
-    @JvmOverloads
-    fun colorMatrix(
-        inputArray: ByteArray,
-        inputVectorSize: Int,
-        sizeX: Int,
-        sizeY: Int,
-        outputVectorSize: Int,
-        matrix: FloatArray,
-        addVector: FloatArray = floatArrayOf(0f, 0f, 0f, 0f),
-        restriction: Range2d? = null
-    ): ByteArray {
-        require(inputVectorSize in 1..4) {
-            "$externalName colorMatrix. The inputVectorSize should be between 1 and 4. " +
-                    "$inputVectorSize provided."
-        }
-        require(outputVectorSize in 1..4) {
-            "$externalName colorMatrix. The outputVectorSize should be between 1 and 4. " +
-                    "$outputVectorSize provided."
-        }
-        require(inputArray.size >= sizeX * sizeY * inputVectorSize) {
-            "$externalName colorMatrix. inputArray is too small for the given dimensions. " +
-                    "$sizeX*$sizeY*$inputVectorSize < ${inputArray.size}."
-        }
-        require(matrix.size == 16) {
-            "$externalName colorMatrix. matrix should have 16 entries. ${matrix.size} provided."
-        }
-        require(addVector.size == 4) {
-            "$externalName colorMatrix. addVector should have 4 entries. " +
-                    "${addVector.size} provided."
-        }
-        validateRestriction("colorMatrix", sizeX, sizeY, restriction)
-
-        val outputArray = ByteArray(sizeX * sizeY * paddedSize(outputVectorSize))
-        nativeColorMatrix(
-            nativeHandle, inputArray, inputVectorSize, sizeX, sizeY, outputArray, outputVectorSize,
-            matrix, addVector, restriction
-        )
-        return outputArray
-    }
-
-    /**
-     * Transform an image using a color matrix.
-     *
-     * Converts a bitmap, multiplying each RGBA value by a 4x4 matrix and adding an optional vector.
-     * Each byte of the RGBA is converted from 0-255 to 0.0-1.0 floats before the multiplication
-     * is done.
-     *
-     * Bitmaps with a stride different than width * vectorSize are not currently supported.
-     *
-     * The resulting value is normalized from 0.0-1.0 to a 0-255 value and stored in the output.
-     *
-     * If addVector is not specified, a vector of zeroes is added, i.e. a noop.
-     *
-     * Check identityMatrix, greyScaleColorMatrix, rgbToYuvMatrix, and yuvToRgbMatrix for sample
-     * matrices. The YUV conversion may not work for all color spaces.
-     *
-     * @param inputBitmap The image to be converted.
-     * @param matrix The 4x4 matrix to multiply, in row major format.
-     * @param addVector A vector of four floats that's added to the result of the multiplication.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     * @return The converted buffer.
-     */
-    @JvmOverloads
-    fun colorMatrix(
-        inputBitmap: Bitmap,
-        matrix: FloatArray,
-        addVector: FloatArray = floatArrayOf(0f, 0f, 0f, 0f),
-        restriction: Range2d? = null
-    ): Bitmap {
-        validateBitmap("colorMatrix", inputBitmap)
-        require(matrix.size == 16) {
-            "$externalName colorMatrix. matrix should have 16 entries. ${matrix.size} provided."
-        }
-        require(addVector.size == 4) {
-            "$externalName colorMatrix. addVector should have 4 entries."
-        }
-        validateRestriction("colorMatrix", inputBitmap.width, inputBitmap.height, restriction)
-
-        val outputBitmap = createCompatibleBitmap(inputBitmap)
-        nativeColorMatrixBitmap(
-            nativeHandle,
-            inputBitmap,
-            outputBitmap,
-            matrix,
-            addVector,
-            restriction
-        )
-        return outputBitmap
-    }
-
-    /**
-     * Convolve a ByteArray.
-     *
-     * Applies a 3x3 or 5x5 convolution to the input array using the provided coefficients.
-     * A variant of this method is available to convolve Bitmaps.
-     *
-     * For 3x3 convolutions, 9 coefficients must be provided. For 5x5, 25 coefficients are needed.
-     * The coefficients should be provided in row-major format.
-     *
-     * When the square extends past the edge, the edge values will be used as replacement for the
-     * values that's are off boundary.
-     *
-     * Each input cell can either be represented by one to four bytes. Each byte is multiplied
-     * and accumulated independently of the other bytes of the cell.
-     *
-     * An optional range parameter can be set to restrict the convolve operation to a rectangular
-     * subset of each buffer. If provided, the range must be wholly contained with the dimensions
-     * described by sizeX and sizeY. NOTE: The output buffer will still be full size, with the
-     * section that's not convolved all set to 0. This is to stay compatible with RenderScript.
-     *
-     * The source array should be large enough for sizeX * sizeY * vectorSize bytes. It has a
-     * row-major layout. The output array will have the same dimensions.
-     *
-     * Like the RenderScript Intrinsics, vectorSize of size 3 are padded to occupy 4 bytes.
-     *
-     * @param inputArray The buffer of the image to be blurred.
-     * @param vectorSize The number of bytes in each cell, a value from 1 to 4.
-     * @param sizeX The width of both buffers, as a number of 1 or 4 byte cells.
-     * @param sizeY The height of both buffers, as a number of 1 or 4 byte cells.
-     * @param coefficients A FloatArray of size 9 or 25, containing the multipliers.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     * @return The convolved array.
-     */
-    @JvmOverloads
-    fun convolve(
-        inputArray: ByteArray,
-        vectorSize: Int,
-        sizeX: Int,
-        sizeY: Int,
-        coefficients: FloatArray,
-        restriction: Range2d? = null
-    ): ByteArray {
-        require(vectorSize in 1..4) {
-            "$externalName convolve. The vectorSize should be between 1 and 4. " +
-                    "$vectorSize provided."
-        }
-        require(inputArray.size >= sizeX * sizeY * vectorSize) {
-            "$externalName convolve. inputArray is too small for the given dimensions. " +
-                    "$sizeX*$sizeY*$vectorSize < ${inputArray.size}."
-        }
-        require(coefficients.size == 9 || coefficients.size == 25) {
-            "$externalName convolve. Only 3x3 or 5x5 convolutions are supported. " +
-                    "${coefficients.size} coefficients provided."
-        }
-        validateRestriction("convolve", sizeX, sizeY, restriction)
-
-        val outputArray = ByteArray(inputArray.size)
-        nativeConvolve(
-            nativeHandle,
-            inputArray,
-            vectorSize,
-            sizeX,
-            sizeY,
-            outputArray,
-            coefficients,
-            restriction
-        )
-        return outputArray
-    }
-
-    /**
-     * Convolve a Bitmap.
-     *
-     * Applies a 3x3 or 5x5 convolution to the input Bitmap using the provided coefficients.
-     * A variant of this method is available to convolve ByteArrays. Bitmaps with a stride different
-     * than width * vectorSize are not currently supported.
-     *
-     * For 3x3 convolutions, 9 coefficients must be provided. For 5x5, 25 coefficients are needed.
-     * The coefficients should be provided in row-major format.
-     *
-     * Each input cell can either be represented by one to four bytes. Each byte is multiplied
-     * and accumulated independently of the other bytes of the cell.
-     *
-     * An optional range parameter can be set to restrict the convolve operation to a rectangular
-     * subset of each buffer. If provided, the range must be wholly contained with the dimensions
-     * described by sizeX and sizeY. NOTE: The output Bitmap will still be full size, with the
-     * section that's not convolved all set to 0. This is to stay compatible with RenderScript.
-     *
-     * @param inputBitmap The image to be blurred.
-     * @param coefficients A FloatArray of size 9 or 25, containing the multipliers.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     * @return The convolved Bitmap.
-     */
-    @JvmOverloads
-    fun convolve(
-        inputBitmap: Bitmap,
-        coefficients: FloatArray,
-        restriction: Range2d? = null
-    ): Bitmap {
-        validateBitmap("convolve", inputBitmap)
-        require(coefficients.size == 9 || coefficients.size == 25) {
-            "$externalName convolve. Only 3x3 or 5x5 convolutions are supported. " +
-                    "${coefficients.size} coefficients provided."
-        }
-        validateRestriction("convolve", inputBitmap, restriction)
-
-        val outputBitmap = createCompatibleBitmap(inputBitmap)
-        nativeConvolveBitmap(nativeHandle, inputBitmap, outputBitmap, coefficients, restriction)
-        return outputBitmap
-    }
-
-    /**
-     * Compute the histogram of an image.
-     *
-     * Tallies how many times each of the 256 possible values of a byte is found in the input.
-     * A variant of this method is available to do the histogram of a Bitmap.
-     *
-     * An input cell can be represented by one to four bytes. The tally is done independently
-     * for each of the bytes of the cell. Correspondingly, the returned IntArray will have
-     * 256 * vectorSize entries. The counts for value 0 are consecutive, followed by those for
-     * value 1, etc.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of each buffer. If provided, the range must be wholly contained with the dimensions
-     * described by sizeX and sizeY.
-     *
-     * The source buffer should be large enough for sizeX * sizeY * vectorSize bytes. It has a
-     * row-major layout.
-     *
-     * Like the RenderScript Intrinsics, vectorSize of size 3 are padded to occupy 4 bytes.
-     *
-     * @param inputArray The buffer of the image to be analyzed.
-     * @param vectorSize The number of bytes in each cell, a value from 1 to 4.
-     * @param sizeX The width of the input buffers, as a number of 1 to 4 byte cells.
-     * @param sizeY The height of the input buffers, as a number of 1 to 4 byte cells.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     * @return The resulting array of counts.
-     */
-    @JvmOverloads
-    fun histogram(
-        inputArray: ByteArray,
-        vectorSize: Int,
-        sizeX: Int,
-        sizeY: Int,
-        restriction: Range2d? = null
-    ): IntArray {
-        require(vectorSize in 1..4) {
-            "$externalName histogram. The vectorSize should be between 1 and 4. " +
-                    "$vectorSize provided."
-        }
-        require(inputArray.size >= sizeX * sizeY * vectorSize) {
-            "$externalName histogram. inputArray is too small for the given dimensions. " +
-                    "$sizeX*$sizeY*$vectorSize < ${inputArray.size}."
-        }
-        validateRestriction("histogram", sizeX, sizeY, restriction)
-
-        val outputArray = IntArray(256 * paddedSize(vectorSize))
-        nativeHistogram(
-            nativeHandle,
-            inputArray,
-            vectorSize,
-            sizeX,
-            sizeY,
-            outputArray,
-            restriction
-        )
-        return outputArray
-    }
-
-    /**
-     * Compute the histogram of an image.
-     *
-     * Tallies how many times each of the 256 possible values of a byte is found in the bitmap.
-     * This method supports Bitmaps of config ARGB_8888 and ALPHA_8.
-     *
-     * For ARGB_8888, the tally is done independently of the four bytes. Correspondingly, the
-     * returned IntArray will have 4 * 256 entries. The counts for value 0 are consecutive,
-     * followed by those for value 1, etc.
-     *
-     * For ALPHA_8, an IntArray of size 256 is returned.
-     *
-     * Bitmaps with a stride different than width * vectorSize are not currently supported.
-     *
-     * A variant of this method is available to do the histogram of a ByteArray.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of each buffer. If provided, the range must be wholly contained with the dimensions
-     * described by sizeX and sizeY.
-     *
-     * @param inputBitmap The bitmap to be analyzed.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     * @return The resulting array of counts.
-     */
-    @JvmOverloads
-    fun histogram(
-        inputBitmap: Bitmap,
-        restriction: Range2d? = null
-    ): IntArray {
-        validateBitmap("histogram", inputBitmap)
-        validateRestriction("histogram", inputBitmap, restriction)
-
-        val outputArray = IntArray(256 * vectorSize(inputBitmap))
-        nativeHistogramBitmap(nativeHandle, inputBitmap, outputArray, restriction)
-        return outputArray
-    }
-
-    /**
-     * Compute the histogram of the dot product of an image.
-     *
-     * This method supports cells of 1 to 4 bytes in length. For each cell of the array,
-     * the dot product of its bytes with the provided coefficients is computed. The resulting
-     * floating point value is converted to an unsigned byte and tallied in the histogram.
-     *
-     * If coefficients is null, the coefficients used for RGBA luminosity calculation will be used,
-     * i.e. the values [0.299f, 0.587f, 0.114f, 0.f].
-     *
-     * Each coefficients must be >= 0 and their sum must be 1.0 or less. There must be the same
-     * number of coefficients as vectorSize.
-     *
-     * A variant of this method is available to do the histogram of a Bitmap.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of each buffer. If provided, the range must be wholly contained with the dimensions
-     * described by sizeX and sizeY.
-     *
-     * The source buffer should be large enough for sizeX * sizeY * vectorSize bytes. The returned
-     * array will have 256 ints.
-     *
-     * Like the RenderScript Intrinsics, vectorSize of size 3 are padded to occupy 4 bytes.
-     *
-     * @param inputArray The buffer of the image to be analyzed.
-     * @param vectorSize The number of bytes in each cell, a value from 1 to 4.
-     * @param sizeX The width of the input buffers, as a number of 1 to 4 byte cells.
-     * @param sizeY The height of the input buffers, as a number of 1 to 4 byte cells.
-     * @param coefficients The dot product multipliers. Size should equal vectorSize. Can be null.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     * @return The resulting vector of counts.
-     */
-    @JvmOverloads
-    fun histogramDot(
-        inputArray: ByteArray,
-        vectorSize: Int,
-        sizeX: Int,
-        sizeY: Int,
-        coefficients: FloatArray? = null,
-        restriction: Range2d? = null
-    ): IntArray {
-        require(vectorSize in 1..4) {
-            "$externalName histogramDot. The vectorSize should be between 1 and 4. " +
-                    "$vectorSize provided."
-        }
-        require(inputArray.size >= sizeX * sizeY * vectorSize) {
-            "$externalName histogramDot. inputArray is too small for the given dimensions. " +
-                    "$sizeX*$sizeY*$vectorSize < ${inputArray.size}."
-        }
-        validateHistogramDotCoefficients(coefficients, vectorSize)
-        validateRestriction("histogramDot", sizeX, sizeY, restriction)
-
-        val outputArray = IntArray(256)
-        val actualCoefficients = coefficients ?: floatArrayOf(0.299f, 0.587f, 0.114f, 0f)
-        nativeHistogramDot(
-            nativeHandle,
-            inputArray,
-            vectorSize,
-            sizeX,
-            sizeY,
-            outputArray,
-            actualCoefficients,
-            restriction
-        )
-        return outputArray
-    }
-
-    /**
-     * Compute the histogram of the dot product of an image.
-     *
-     * This method supports Bitmaps of config ARGB_8888 and ALPHA_8. For each pixel of the bitmap,
-     * the dot product of its bytes with the provided coefficients is computed. The resulting
-     * floating point value is converted to an unsigned byte and tallied in the histogram.
-     *
-     * If coefficients is null, the coefficients used for RGBA luminosity calculation will be used,
-     * i.e. the values [0.299f, 0.587f, 0.114f, 0.f].
-     *
-     * Each coefficients must be >= 0 and their sum must be 1.0 or less. For ARGB_8888, four values
-     * must be provided; for ALPHA_8, one.
-     *
-     * Bitmaps with a stride different than width * vectorSize are not currently supported.
-     *
-     * A variant of this method is available to do the histogram of a ByteArray.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of each buffer. If provided, the range must be wholly contained with the dimensions
-     * described by sizeX and sizeY.
-     *
-     * The returned array will have 256 ints.
-     *
-     * @param inputBitmap The bitmap to be analyzed.
-     * @param coefficients The one or four values used for the dot product. Can be null.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     * @return The resulting vector of counts.
-     */
-    @JvmOverloads
-    fun histogramDot(
-        inputBitmap: Bitmap,
-        coefficients: FloatArray? = null,
-        restriction: Range2d? = null
-    ): IntArray {
-        validateBitmap("histogramDot", inputBitmap)
-        validateHistogramDotCoefficients(coefficients, vectorSize(inputBitmap))
-        validateRestriction("histogramDot", inputBitmap, restriction)
-
-        val outputArray = IntArray(256)
-        val actualCoefficients = coefficients ?: floatArrayOf(0.299f, 0.587f, 0.114f, 0f)
-        nativeHistogramDotBitmap(
-            nativeHandle, inputBitmap, outputArray, actualCoefficients, restriction
-        )
-        return outputArray
-    }
-
-    /**
-     * Transform an image using a look up table
-     *
-     * Transforms an image by using a per-channel lookup table. Each channel of the input has an
-     * independent lookup table. The tables are 256 entries in size and can cover the full value
-     * range of a byte.
-     *
-     * The input array should be in RGBA format, where four consecutive bytes form an cell.
-     * A variant of this method is available to transform a Bitmap.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of each buffer. If provided, the range must be wholly contained with the dimensions
-     * described by sizeX and sizeY. NOTE: The output Bitmap will still be full size, with the
-     * section that's not convolved all set to 0. This is to stay compatible with RenderScript.
-     *
-     * The source array should be large enough for sizeX * sizeY * vectorSize bytes. The returned
-     * ray has the same dimensions as the input. The arrays have a row-major layout.
-     *
-     * @param inputArray The buffer of the image to be transformed.
-     * @param sizeX The width of both buffers, as a number of 4 byte cells.
-     * @param sizeY The height of both buffers, as a number of 4 byte cells.
-     * @param table The four arrays of 256 values that's used to convert each channel.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     * @return The transformed image.
-     */
-    @JvmOverloads
-    fun lut(
-        inputArray: ByteArray,
-        sizeX: Int,
-        sizeY: Int,
-        table: LookupTable,
-        restriction: Range2d? = null
-    ): ByteArray {
-        require(inputArray.size >= sizeX * sizeY * 4) {
-            "$externalName lut. inputArray is too small for the given dimensions. " +
-                    "$sizeX*$sizeY*4 < ${inputArray.size}."
-        }
-        validateRestriction("lut", sizeX, sizeY, restriction)
-
-        val outputArray = ByteArray(inputArray.size)
-        nativeLut(
-            nativeHandle,
-            inputArray,
-            outputArray,
-            sizeX,
-            sizeY,
-            table.red,
-            table.green,
-            table.blue,
-            table.alpha,
-            restriction
-        )
-        return outputArray
-    }
-
-    /**
-     * Transform an image using a look up table
-     *
-     * Transforms an image by using a per-channel lookup table. Each channel of the input has an
-     * independent lookup table. The tables are 256 entries in size and can cover the full value
-     * range of a byte.
-     *
-     * The input Bitmap should be in config ARGB_8888. A variant of this method is available to
-     * transform a ByteArray. Bitmaps with a stride different than width * vectorSize are not
-     * currently supported.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of each buffer. If provided, the range must be wholly contained with the dimensions
-     * described by sizeX and sizeY. NOTE: The output Bitmap will still be full size, with the
-     * section that's not convolved all set to 0. This is to stay compatible with RenderScript.
-     *
-     * @param inputBitmap The buffer of the image to be transformed.
-     * @param table The four arrays of 256 values that's used to convert each channel.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     * @return The transformed image.
-     */
-    @JvmOverloads
-    fun lut(
-        inputBitmap: Bitmap,
-        table: LookupTable,
-        restriction: Range2d? = null
-    ): Bitmap {
-        validateBitmap("lut", inputBitmap)
-        validateRestriction("lut", inputBitmap, restriction)
-
-        val outputBitmap = createCompatibleBitmap(inputBitmap)
-        nativeLutBitmap(
-            nativeHandle,
-            inputBitmap,
-            outputBitmap,
-            table.red,
-            table.green,
-            table.blue,
-            table.alpha,
-            restriction
-        )
-        return outputBitmap
-    }
-
-    /**
-     * Transform an image using a 3D look up table
-     *
-     * Transforms an image, converting RGB to RGBA by using a 3D lookup table. The incoming R, G,
-     * and B values are normalized to the dimensions of the provided 3D buffer. The eight nearest
-     * values in that 3D buffer are sampled and linearly interpolated. The resulting RGBA entry
-     * is returned in the output array.
-     *
-     * The input array should be in RGBA format, where four consecutive bytes form an cell.
-     * The fourth byte of each input cell is ignored. A variant of this method is also available
-     * to transform Bitmaps.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of each buffer. If provided, the range must be wholly contained with the dimensions
-     * described by sizeX and sizeY. NOTE: The output array will still be full size, with the
-     * section that's not convolved all set to 0. This is to stay compatible with RenderScript.
-     *
-     * The source array should be large enough for sizeX * sizeY * vectorSize bytes. The returned
-     * array will have the same dimensions. The arrays have a row-major layout.
-     *
-     * @param inputArray The buffer of the image to be transformed.
-     * @param sizeX The width of both buffers, as a number of 4 byte cells.
-     * @param sizeY The height of both buffers, as a number of 4 byte cells.
-     * @param cube The translation cube.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     * @return The transformed image.
-     */
-    @JvmOverloads
-    fun lut3d(
-        inputArray: ByteArray,
-        sizeX: Int,
-        sizeY: Int,
-        cube: Rgba3dArray,
-        restriction: Range2d? = null
-    ): ByteArray {
-        require(inputArray.size >= sizeX * sizeY * 4) {
-            "$externalName lut3d. inputArray is too small for the given dimensions. " +
-                    "$sizeX*$sizeY*4 < ${inputArray.size}."
-        }
-        require(
-            cube.sizeX >= 2 && cube.sizeY >= 2 && cube.sizeZ >= 2 &&
-                    cube.sizeX <= 256 && cube.sizeY <= 256 && cube.sizeZ <= 256
-        ) {
-            "$externalName lut3d. The dimensions of the cube should be between 2 and 256. " +
-                    "(${cube.sizeX}, ${cube.sizeY}, ${cube.sizeZ}) provided."
-        }
-        validateRestriction("lut3d", sizeX, sizeY, restriction)
-
-        val outputArray = ByteArray(inputArray.size)
-        nativeLut3d(
-            nativeHandle, inputArray, outputArray, sizeX, sizeY, cube.values, cube.sizeX,
-            cube.sizeY, cube.sizeZ, restriction
-        )
-        return outputArray
-    }
-
-    /**
-     * Transform an image using a 3D look up table
-     *
-     * Transforms an image, converting RGB to RGBA by using a 3D lookup table. The incoming R, G,
-     * and B values are normalized to the dimensions of the provided 3D buffer. The eight nearest
-     * values in that 3D buffer are sampled and linearly interpolated. The resulting RGBA entry
-     * is returned in the output array.
-     *
-     * The input bitmap should be in RGBA_8888 format. The A channel is preserved. A variant of this
-     * method is also available to transform ByteArray. Bitmaps with a stride different than
-     * width * vectorSize are not currently supported.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of each buffer. If provided, the range must be wholly contained with the dimensions
-     * described by sizeX and sizeY. NOTE: The output array will still be full size, with the
-     * section that's not convolved all set to 0. This is to stay compatible with RenderScript.
-     *
-     * The source array should be large enough for sizeX * sizeY * vectorSize bytes. The returned
-     * array will have the same dimensions. The arrays have a row-major layout.
-     *
-     * @param inputBitmap The image to be transformed.
-     * @param cube The translation cube.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     * @return The transformed image.
-     */
-    @JvmOverloads
-    fun lut3d(
-        inputBitmap: Bitmap,
-        cube: Rgba3dArray,
-        restriction: Range2d? = null
-    ): Bitmap {
-        validateBitmap("lut3d", inputBitmap)
-        validateRestriction("lut3d", inputBitmap, restriction)
-
-        val outputBitmap = createCompatibleBitmap(inputBitmap)
-        nativeLut3dBitmap(
-            nativeHandle, inputBitmap, outputBitmap, cube.values, cube.sizeX,
-            cube.sizeY, cube.sizeZ, restriction
-        )
-        return outputBitmap
-    }
-
-    /**
-     * Resize an image.
-     *
-     * Resizes an image using bicubic interpolation.
-     *
-     * This method supports elements of 1 to 4 bytes in length. Each byte of the element is
-     * interpolated independently from the others.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of the output buffer. The corresponding scaled range of the input will be used. If provided,
-     * the range must be wholly contained with the dimensions described by outputSizeX and
-     * outputSizeY.
-     *
-     * The input and output arrays have a row-major layout. The input array should be
-     * large enough for sizeX * sizeY * vectorSize bytes.
-     *
-     * Like the RenderScript Intrinsics, vectorSize of size 3 are padded to occupy 4 bytes.
-     *
-     * @param inputArray The buffer of the image to be resized.
-     * @param vectorSize The number of bytes in each element of both buffers. A value from 1 to 4.
-     * @param inputSizeX The width of the input buffer, as a number of 1-4 byte elements.
-     * @param inputSizeY The height of the input buffer, as a number of 1-4 byte elements.
-     * @param outputSizeX The width of the output buffer, as a number of 1-4 byte elements.
-     * @param outputSizeY The height of the output buffer, as a number of 1-4 byte elements.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     * @return An array that contains the rescaled image.
-     */
-    @JvmOverloads
-    fun resize(
-        inputArray: ByteArray,
-        vectorSize: Int,
-        inputSizeX: Int,
-        inputSizeY: Int,
-        outputSizeX: Int,
-        outputSizeY: Int,
-        restriction: Range2d? = null
-    ): ByteArray {
-        require(vectorSize in 1..4) {
-            "$externalName resize. The vectorSize should be between 1 and 4. $vectorSize provided."
-        }
-        require(inputArray.size >= inputSizeX * inputSizeY * vectorSize) {
-            "$externalName resize. inputArray is too small for the given dimensions. " +
-                    "$inputSizeX*$inputSizeY*$vectorSize < ${inputArray.size}."
-        }
-        validateRestriction("resize", outputSizeX, outputSizeY, restriction)
-
-        val outputArray = ByteArray(outputSizeX * outputSizeY * paddedSize(vectorSize))
-        nativeResize(
-            nativeHandle,
-            inputArray,
-            vectorSize,
-            inputSizeX,
-            inputSizeY,
-            outputArray,
-            outputSizeX,
-            outputSizeY,
-            restriction
-        )
-        return outputArray
-    }
-
-    /**
-     * Resize an image.
-     *
-     * Resizes an image using bicubic interpolation.
-     *
-     * This method supports input Bitmap of config ARGB_8888 and ALPHA_8. The returned Bitmap
-     * has the same config. Bitmaps with a stride different than width * vectorSize are not
-     * currently supported.
-     *
-     * An optional range parameter can be set to restrict the operation to a rectangular subset
-     * of the output buffer. The corresponding scaled range of the input will be used. If provided,
-     * the range must be wholly contained with the dimensions described by outputSizeX and
-     * outputSizeY.
-     *
-     * @param inputBitmap The Bitmap to be resized.
-     * @param outputSizeX The width of the output buffer, as a number of 1-4 byte elements.
-     * @param outputSizeY The height of the output buffer, as a number of 1-4 byte elements.
-     * @param restriction When not null, restricts the operation to a 2D range of pixels.
-     * @return A Bitmap that contains the rescaled image.
-     */
-    @JvmOverloads
-    fun resize(
-        inputBitmap: Bitmap,
-        outputSizeX: Int,
-        outputSizeY: Int,
-        restriction: Range2d? = null
-    ): Bitmap {
-        validateBitmap("resize", inputBitmap)
-        validateRestriction("resize", outputSizeX, outputSizeY, restriction)
-
-        val outputBitmap = Bitmap.createBitmap(outputSizeX, outputSizeY, Bitmap.Config.ARGB_8888)
-        nativeResizeBitmap(nativeHandle, inputBitmap, outputBitmap, restriction)
-        return outputBitmap
-    }
-
-    /**
-     * Convert an image from YUV to RGB.
-     *
-     * Converts a YUV buffer to RGB. The input array should be supplied in a supported YUV format.
-     * The output is RGBA; the alpha channel will be set to 255.
-     *
-     * Note that for YV12 and a sizeX that's not a multiple of 32, the RenderScript Intrinsic may
-     * not have converted the image correctly. This Toolkit method should.
-     *
-     * @param inputArray The buffer of the image to be converted.
-     * @param sizeX The width in pixels of the image.
-     * @param sizeY The height in pixels of the image.
-     * @param format Either YV12 or NV21.
-     * @return The converted image as a byte array.
-     */
-    fun yuvToRgb(inputArray: ByteArray, sizeX: Int, sizeY: Int, format: YuvFormat): ByteArray {
-        require(sizeX % 2 == 0 && sizeY % 2 == 0) {
-            "$externalName yuvToRgb. Non-even dimensions are not supported. " +
-                    "$sizeX and $sizeY were provided."
-        }
-
-        val outputArray = ByteArray(sizeX * sizeY * 4)
-        nativeYuvToRgb(nativeHandle, inputArray, outputArray, sizeX, sizeY, format.value)
-        return outputArray
-    }
-
-    /**
-     * Convert an image from YUV to an RGB Bitmap.
-     *
-     * Converts a YUV buffer to an RGB Bitmap. The input array should be supplied in a supported
-     * YUV format. The output is RGBA; the alpha channel will be set to 255.
-     *
-     * Note that for YV12 and a sizeX that's not a multiple of 32, the RenderScript Intrinsic may
-     * not have converted the image correctly. This Toolkit method should.
-     *
-     * @param inputArray The buffer of the image to be converted.
-     * @param sizeX The width in pixels of the image.
-     * @param sizeY The height in pixels of the image.
-     * @param format Either YV12 or NV21.
-     * @return The converted image.
-     */
-    fun yuvToRgbBitmap(inputArray: ByteArray, sizeX: Int, sizeY: Int, format: YuvFormat): Bitmap {
-        require(sizeX % 2 == 0 && sizeY % 2 == 0) {
-            "$externalName yuvToRgbBitmap. Non-even dimensions are not supported. " +
-                    "$sizeX and $sizeY were provided."
-        }
-
-        val outputBitmap = Bitmap.createBitmap(sizeX, sizeY, Bitmap.Config.ARGB_8888)
-        nativeYuvToRgbBitmap(nativeHandle, inputArray, sizeX, sizeY, outputBitmap, format.value)
-        return outputBitmap
-    }
-
-    init {
-        System.loadLibrary("renderscript-toolkit")
-        nativeHandle = createNative()
-    }
-
-    /**
-     * Shutdown the thread pool.
-     *
-     * Waits for the threads to complete their work and destroys them.
-     *
-     * An application should call this method only if it is sure that it won't call the
-     * toolkit again, as it is irreversible.
-     */
-    fun shutdown() {
-        destroyNative(nativeHandle)
-        nativeHandle = 0
-    }
-
-    private var nativeHandle: Long = 0
-
-    private external fun createNative(): Long
-
-    private external fun destroyNative(nativeHandle: Long)
-
-    private external fun nativeBlend(
-        nativeHandle: Long,
-        mode: Int,
-        sourceArray: ByteArray,
-        destArray: ByteArray,
-        sizeX: Int,
-        sizeY: Int,
-        restriction: Range2d?
-    )
-
-    private external fun nativeBlendBitmap(
-        nativeHandle: Long,
-        mode: Int,
-        sourceBitmap: Bitmap,
-        destBitmap: Bitmap,
-        restriction: Range2d?
-    )
-
-    private external fun nativeBlur(
-        nativeHandle: Long,
-        inputArray: ByteArray,
-        vectorSize: Int,
-        sizeX: Int,
-        sizeY: Int,
-        radius: Int,
-        outputArray: ByteArray,
-        restriction: Range2d?
-    )
-
-    private external fun nativeBlurBitmap(
-        nativeHandle: Long,
-        inputBitmap: Bitmap,
-        outputBitmap: Bitmap,
-        radius: Int,
-        restriction: Range2d?
-    )
-
-    private external fun nativeColorMatrix(
-        nativeHandle: Long,
-        inputArray: ByteArray,
-        inputVectorSize: Int,
-        sizeX: Int,
-        sizeY: Int,
-        outputArray: ByteArray,
-        outputVectorSize: Int,
-        matrix: FloatArray,
-        addVector: FloatArray,
-        restriction: Range2d?
-    )
-
-    private external fun nativeColorMatrixBitmap(
-        nativeHandle: Long,
-        inputBitmap: Bitmap,
-        outputBitmap: Bitmap,
-        matrix: FloatArray,
-        addVector: FloatArray,
-        restriction: Range2d?
-    )
-
-    private external fun nativeConvolve(
-        nativeHandle: Long,
-        inputArray: ByteArray,
-        vectorSize: Int,
-        sizeX: Int,
-        sizeY: Int,
-        outputArray: ByteArray,
-        coefficients: FloatArray,
-        restriction: Range2d?
-    )
-
-    private external fun nativeConvolveBitmap(
-        nativeHandle: Long,
-        inputBitmap: Bitmap,
-        outputBitmap: Bitmap,
-        coefficients: FloatArray,
-        restriction: Range2d?
-    )
-
-    private external fun nativeHistogram(
-        nativeHandle: Long,
-        inputArray: ByteArray,
-        vectorSize: Int,
-        sizeX: Int,
-        sizeY: Int,
-        outputArray: IntArray,
-        restriction: Range2d?
-    )
-
-    private external fun nativeHistogramBitmap(
-        nativeHandle: Long,
-        inputBitmap: Bitmap,
-        outputArray: IntArray,
-        restriction: Range2d?
-    )
-
-    private external fun nativeHistogramDot(
-        nativeHandle: Long,
-        inputArray: ByteArray,
-        vectorSize: Int,
-        sizeX: Int,
-        sizeY: Int,
-        outputArray: IntArray,
-        coefficients: FloatArray,
-        restriction: Range2d?
-    )
-
-    private external fun nativeHistogramDotBitmap(
-        nativeHandle: Long,
-        inputBitmap: Bitmap,
-        outputArray: IntArray,
-        coefficients: FloatArray,
-        restriction: Range2d?
-    )
-
-    private external fun nativeLut(
-        nativeHandle: Long,
-        inputArray: ByteArray,
-        outputArray: ByteArray,
-        sizeX: Int,
-        sizeY: Int,
-        red: ByteArray,
-        green: ByteArray,
-        blue: ByteArray,
-        alpha: ByteArray,
-        restriction: Range2d?
-    )
-
-    private external fun nativeLutBitmap(
-        nativeHandle: Long,
-        inputBitmap: Bitmap,
-        outputBitmap: Bitmap,
-        red: ByteArray,
-        green: ByteArray,
-        blue: ByteArray,
-        alpha: ByteArray,
-        restriction: Range2d?
-    )
-
-    private external fun nativeLut3d(
-        nativeHandle: Long,
-        inputArray: ByteArray,
-        outputArray: ByteArray,
-        sizeX: Int,
-        sizeY: Int,
-        cube: ByteArray,
-        cubeSizeX: Int,
-        cubeSizeY: Int,
-        cubeSizeZ: Int,
-        restriction: Range2d?
-    )
-
-    private external fun nativeLut3dBitmap(
-        nativeHandle: Long,
-        inputBitmap: Bitmap,
-        outputBitmap: Bitmap,
-        cube: ByteArray,
-        cubeSizeX: Int,
-        cubeSizeY: Int,
-        cubeSizeZ: Int,
-        restriction: Range2d?
-    )
-
-    private external fun nativeResize(
-        nativeHandle: Long,
-        inputArray: ByteArray,
-        vectorSize: Int,
-        inputSizeX: Int,
-        inputSizeY: Int,
-        outputArray: ByteArray,
-        outputSizeX: Int,
-        outputSizeY: Int,
-        restriction: Range2d?
-    )
-
-    private external fun nativeResizeBitmap(
-        nativeHandle: Long,
-        inputBitmap: Bitmap,
-        outputBitmap: Bitmap,
-        restriction: Range2d?
-    )
-
-    private external fun nativeYuvToRgb(
-        nativeHandle: Long,
-        inputArray: ByteArray,
-        outputArray: ByteArray,
-        sizeX: Int,
-        sizeY: Int,
-        format: Int
-    )
-
-    private external fun nativeYuvToRgbBitmap(
-        nativeHandle: Long,
-        inputArray: ByteArray,
-        sizeX: Int,
-        sizeY: Int,
-        outputBitmap: Bitmap,
-        value: Int
-    )
-}
-
-/**
- * Determines how a source buffer is blended into a destination buffer.
- * See {@link RenderScriptToolkit::blend}.
- *
- * blend only works on 4 byte RGBA data. In the descriptions below, ".a" represents
- * the alpha channel.
- */
-enum class BlendingMode(val value: Int) {
-    /**
-     * dest = 0
-     *
-     * The destination is cleared, i.e. each pixel is set to (0, 0, 0, 0)
-     */
-    CLEAR(0),
-
-    /**
-     * dest = src
-     *
-     * Sets each pixel of the destination to the corresponding one in the source.
-     */
-    SRC(1),
-
-    /**
-     * dest = dest
-     *
-     * Leaves the destination untouched. This is a no-op.
-     */
-    DST(2),
-
-    /**
-     * dest = src + dest * (1.0 - src.a)
-     */
-    SRC_OVER(3),
-
-    /**
-     * dest = dest + src * (1.0 - dest.a)
-     */
-    DST_OVER(4),
-
-    /**
-     * dest = src * dest.a
-     */
-    SRC_IN(5),
-
-    /**
-     * dest = dest * src.a
-     */
-    DST_IN(6),
-
-    /**
-     * dest = src * (1.0 - dest.a)
-     */
-    SRC_OUT(7),
-
-    /**
-     * dest = dest * (1.0 - src.a)
-     */
-    DST_OUT(8),
-
-    /**
-     * dest.rgb = src.rgb * dest.a + (1.0 - src.a) * dest.rgb, dest.a = dest.a
-     */
-    SRC_ATOP(9),
-
-    /**
-     * dest = dest.rgb * src.a + (1.0 - dest.a) * src.rgb, dest.a = src.a
-     */
-    DST_ATOP(10),
-
-    /**
-     * dest = {src.r ^ dest.r, src.g ^ dest.g, src.b ^ dest.b, src.a ^ dest.a}
-     *
-     * Note: this is NOT the Porter/Duff XOR mode; this is a bitwise xor.
-     */
-    XOR(11),
-
-    /**
-     * dest = src * dest
-     */
-    MULTIPLY(12),
-
-    /**
-     * dest = min(src + dest, 1.0)
-     */
-    ADD(13),
-
-    /**
-     * dest = max(dest - src, 0.0)
-     */
-    SUBTRACT(14)
-}
-
-/**
- * A translation table used by the lut method. For each potential red, green, blue, and alpha
- * value, specifies it's replacement value.
- *
- * The fields are initialized to be a no-op operation, i.e. replace 1 by 1, 2 by 2, etc.
- * You can modify just the values you're interested in having a translation.
- */
-class LookupTable {
-    var red = ByteArray(256) { it.toByte() }
-    var green = ByteArray(256) { it.toByte() }
-    var blue = ByteArray(256) { it.toByte() }
-    var alpha = ByteArray(256) { it.toByte() }
-}
-
-/**
- * The YUV formats supported by yuvToRgb.
- */
-enum class YuvFormat(val value: Int) {
-    NV21(0x11),
-    YV12(0x32315659),
-}
-
-/**
- * Define a range of data to process.
- *
- * This class is used to restrict a [Toolkit] operation to a rectangular subset of the input
- * tensor.
- *
- * @property startX The index of the first value to be included on the X axis.
- * @property endX The index after the last value to be included on the X axis.
- * @property startY The index of the first value to be included on the Y axis.
- * @property endY The index after the last value to be included on the Y axis.
- */
-data class Range2d(
-    val startX: Int,
-    val endX: Int,
-    val startY: Int,
-    val endY: Int
-) {
-    constructor() : this(0, 0, 0, 0)
-}
-
-class Rgba3dArray(val values: ByteArray, val sizeX: Int, val sizeY: Int, val sizeZ: Int) {
-    init {
-        require(values.size >= sizeX * sizeY * sizeZ * 4)
-    }
-
-    operator fun get(x: Int, y: Int, z: Int): ByteArray {
-        val index = indexOfVector(x, y, z)
-        return ByteArray(4) { values[index + it] }
-    }
-
-    operator fun set(x: Int, y: Int, z: Int, value: ByteArray) {
-        require(value.size == 4)
-        val index = indexOfVector(x, y, z)
-        for (i in 0..3) {
-            values[index + i] = value[i]
-        }
-    }
-
-    private fun indexOfVector(x: Int, y: Int, z: Int): Int {
-        require(x in 0 until sizeX)
-        require(y in 0 until sizeY)
-        require(z in 0 until sizeZ)
-        return ((z * sizeY + y) * sizeX + x) * 4
-    }
-}
-
-internal fun validateBitmap(
-    function: String,
-    inputBitmap: Bitmap,
-    alphaAllowed: Boolean = true
-) {
-    if (alphaAllowed) {
-        require(
-            inputBitmap.config == Bitmap.Config.ARGB_8888 ||
-                    inputBitmap.config == Bitmap.Config.ALPHA_8
-        ) {
-            "$externalName. $function supports only ARGB_8888 and ALPHA_8 bitmaps. " +
-                    "${inputBitmap.config} provided."
-        }
-    } else {
-        require(inputBitmap.config == Bitmap.Config.ARGB_8888) {
-            "$externalName. $function supports only ARGB_8888. " +
-                    "${inputBitmap.config} provided."
-        }
-    }
-    require(inputBitmap.width * vectorSize(inputBitmap) == inputBitmap.rowBytes) {
-        "$externalName $function. Only bitmaps with rowSize equal to the width * vectorSize are " +
-                "currently supported. Provided were rowBytes=${inputBitmap.rowBytes}, " +
-                "width={${inputBitmap.width}, and vectorSize=${vectorSize(inputBitmap)}."
-    }
-}
-
-internal fun createCompatibleBitmap(inputBitmap: Bitmap) =
-    Bitmap.createBitmap(inputBitmap.width, inputBitmap.height, inputBitmap.config)
-
-internal fun validateHistogramDotCoefficients(
-    coefficients: FloatArray?,
-    vectorSize: Int
-) {
-    require(coefficients == null || coefficients.size == vectorSize) {
-        "$externalName histogramDot. The coefficients should be null or have $vectorSize values."
-    }
-    if (coefficients !== null) {
-        var sum = 0f
-        for (i in 0 until vectorSize) {
-            require(coefficients[i] >= 0.0f) {
-                "$externalName histogramDot. Coefficients should not be negative. " +
-                        "Coefficient $i was ${coefficients[i]}."
-            }
-            sum += coefficients[i]
-        }
-        require(sum <= 1.0f) {
-            "$externalName histogramDot. Coefficients should add to 1 or less. Their sum is $sum."
-        }
-    }
-}
-
-internal fun validateRestriction(tag: String, bitmap: Bitmap, restriction: Range2d? = null) {
-    validateRestriction(tag, bitmap.width, bitmap.height, restriction)
-}
-
-internal fun validateRestriction(
-    tag: String,
-    sizeX: Int,
-    sizeY: Int,
-    restriction: Range2d? = null
-) {
-    if (restriction == null) return
-    require(restriction.startX < sizeX && restriction.endX <= sizeX) {
-        "$externalName $tag. sizeX should be greater than restriction.startX and greater " +
-                "or equal to restriction.endX. $sizeX, ${restriction.startX}, " +
-                "and ${restriction.endX} were provided respectively."
-    }
-    require(restriction.startY < sizeY && restriction.endY <= sizeY) {
-        "$externalName $tag. sizeY should be greater than restriction.startY and greater " +
-                "or equal to restriction.endY. $sizeY, ${restriction.startY}, " +
-                "and ${restriction.endY} were provided respectively."
-    }
-    require(restriction.startX < restriction.endX) {
-        "$externalName $tag. Restriction startX should be less than endX. " +
-                "${restriction.startX} and ${restriction.endX} were provided respectively."
-    }
-    require(restriction.startY < restriction.endY) {
-        "$externalName $tag. Restriction startY should be less than endY. " +
-                "${restriction.startY} and ${restriction.endY} were provided respectively."
-    }
-}
-
-internal fun vectorSize(bitmap: Bitmap): Int {
-    return when (bitmap.config) {
-        Bitmap.Config.ARGB_8888 -> 4
-        Bitmap.Config.ALPHA_8 -> 1
-        else -> throw IllegalArgumentException(
-            "$externalName. Only ARGB_8888 and ALPHA_8 Bitmap are supported."
-        )
-    }
-}
-
-internal fun paddedSize(vectorSize: Int) = if (vectorSize == 3) 4 else vectorSize

diff --git a/toolkit/test/AllTests.kt b/toolkit/test/AllTests.kt
deleted file mode 100644
index 5833795..0000000
--- a/toolkit/test/AllTests.kt
+++ /dev/null

@@ -1,1244 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// TODO Rename to something better
-package com.example.testapp
-
-import android.content.Context
-import android.graphics.Bitmap
-import android.graphics.BitmapFactory
-import android.renderscript.RenderScript
-import android.renderscript.toolkit.BlendingMode
-import android.renderscript.toolkit.LookupTable
-import android.renderscript.toolkit.Range2d
-import android.renderscript.toolkit.Rgba3dArray
-import android.renderscript.toolkit.Toolkit
-import android.renderscript.toolkit.YuvFormat
-import kotlin.math.abs
-import kotlin.math.min
-
-data class TestLayout(
-    val sizeX: Int,
-    val sizeY: Int,
-    val restriction: Range2d?
-)
-
-// List of dimensions (sizeX, sizeY) to try when generating random data.
-val commonLayoutsToTry = listOf(
-    // Small layouts to start with
-    TestLayout(3, 4, null),
-    TestLayout(3, 4, Range2d(0, 1, 0, 3)),
-    TestLayout(3, 4, Range2d(2, 3, 1, 4)),
-    TestLayout(10, 14, null),
-    TestLayout(10, 14, Range2d(2, 3, 8, 14)),
-    // The size of most CTS intrinsic tests
-    TestLayout(160, 100, null),
-    TestLayout(125, 227, Range2d(50, 125, 100, 227)),
-    // A larger one
-    TestLayout(800, 600, null),
-    // Weirdly shaped ones
-    TestLayout(1, 1, null), // A single item
-    // TODO This size makes Intrinsic Blur fail.
-    TestLayout(16000, 1, null), // A single item
-    TestLayout(1, 16000, null), // One large row
-    // A very large test
-    TestLayout(1024, 2048, null),
-)
-
-
-class Tester(context: Context, private val validate: Boolean) {
-    private val renderscriptContext = RenderScript.create(context)
-    private val toolkit = Toolkit()
-    private val testImage1 = BitmapFactory.decodeResource(context.resources, R.drawable.img800x450a)
-    private val testImage2 = BitmapFactory.decodeResource(context.resources, R.drawable.img800x450b)
-
-    init {
-        validateTestImage(testImage1)
-        validateTestImage(testImage2)
-    }
-
-    /**
-     * Verify that the test images are in format that works for our tests.
-     */
-    private fun validateTestImage(bitmap: Bitmap) {
-        require(bitmap.config == Bitmap.Config.ARGB_8888)
-        require(bitmap.rowBytes == bitmap.width * 4) {
-            "Can't handle bitmaps that have extra padding. " +
-                "${bitmap.rowBytes} != ${bitmap.width} * 4." }
-        require(bitmap.byteCount == bitmap.rowBytes * bitmap.height)
-    }
-
-    fun destroy() {
-        renderscriptContext.destroy()
-    }
-
-    @ExperimentalUnsignedTypes
-    fun testAll(timer: TimingTracker): String {
-        val tests  = listOf(
-            Pair("blend", ::testBlend),
-            Pair("blur", ::testBlur),
-            Pair("colorMatrix", ::testColorMatrix),
-            Pair("convolve", ::testConvolve),
-            Pair("histogram", ::testHistogram),
-            Pair("lut", ::testLut),
-            Pair("lut3d", ::testLut3d),
-            Pair("resize", ::testResize),
-            Pair("yuvToRgb", ::testYuvToRgb),
-        )
-        val results = Array(tests.size) { "" }
-        for (i in tests.indices) {
-            val (name, test) = tests[i]
-            println("Doing $name")
-            val success = test(timer)
-            results[i] = "$name " + if (success) "succeeded" else "FAILED! FAILED! FAILED! FAILED!"
-            println("      ${results[i]}")
-        }
-
-        return results.joinToString("\n")
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testBlend(timer: TimingTracker): Boolean {
-        return BlendingMode.values().all { mode ->
-            testOneBitmapBlend(timer, testImage1, testImage2, mode, null) and
-                    testOneBitmapBlend(
-                        timer, testImage1, testImage2, mode,
-                        Range2d(6, 23, 2, 4)
-                    ) and
-                    commonLayoutsToTry.all { (sizeX, sizeY, restriction) ->
-                        testOneRandomBlend(timer, sizeX, sizeY, mode, restriction)
-                    }
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testOneRandomBlend(
-        timer: TimingTracker,
-        sizeX: Int,
-        sizeY: Int,
-        mode: BlendingMode,
-        restriction: Range2d?
-    ): Boolean {
-        val sourceArray = randomByteArray(0x50521f0, sizeX, sizeY, 4)
-        val destArray = randomByteArray(0x2932147, sizeX, sizeY, 4)
-        // Make clones because these will be modified by the blend.
-        val intrinsicDestArray = destArray.clone()
-        val referenceDestArray = destArray.clone()
-        val toolkitDestArray = destArray.clone()
-
-        timer.measure("IntrinsicBlend") {
-            intrinsicBlend(
-                renderscriptContext, mode, sourceArray, intrinsicDestArray, sizeX, sizeY,
-                restriction
-            )
-        }
-        timer.measure("ToolkitBlend") {
-            toolkit.blend(mode, sourceArray, toolkitDestArray, sizeX, sizeY, restriction)
-        }
-        if (!validate) return true
-
-        timer.measure("ReferenceBlend") {
-            referenceBlend(mode, sourceArray, referenceDestArray, sizeX, sizeY, restriction)
-        }
-
-        return validateSame(
-            "Blend_$mode", intrinsicDestArray, referenceDestArray, toolkitDestArray
-        ) {
-            println("blend $mode ($sizeX, $sizeY) $restriction")
-            logArray("Blend_$mode src", sourceArray, 48)
-            logArray("Blend_$mode dst", destArray, 48)
-            logArray("Blend_$mode reference out", referenceDestArray, 48)
-            logArray("Blend_$mode intrinsic out", intrinsicDestArray, 48)
-            logArray("Blend_$mode toolkit   out", toolkitDestArray, 48)
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testOneBitmapBlend(
-        timer: TimingTracker,
-        sourceBitmap: Bitmap,
-        destBitmap: Bitmap,
-        mode: BlendingMode,
-        restriction: Range2d?
-    ): Boolean {
-        // Make clones because these will be modified by the blend.
-        val intrinsicDestBitmap = duplicateBitmap(destBitmap)
-        val toolkitDestBitmap = duplicateBitmap(destBitmap)
-        val referenceDestBitmap = duplicateBitmap(destBitmap)
-
-        timer.measure("IntrinsicBlend") {
-            intrinsicBlend(
-                renderscriptContext, mode, sourceBitmap, intrinsicDestBitmap, restriction
-            )
-        }
-        timer.measure("ToolkitBlend") {
-            toolkit.blend(mode, sourceBitmap, toolkitDestBitmap, restriction)
-        }
-        if (!validate) return true
-
-        val referenceDestArray = getBitmapBytes(referenceDestBitmap)
-        timer.measure("ReferenceBlend") {
-            referenceBlend(
-                mode, getBitmapBytes(sourceBitmap), referenceDestArray, sourceBitmap.width,
-                sourceBitmap.height, restriction
-            )
-        }
-
-        val intrinsicDestArray = getBitmapBytes(intrinsicDestBitmap)
-        val toolkitDestArray = getBitmapBytes(toolkitDestBitmap)
-        return validateSame(
-            "BlendBitmap_$mode", intrinsicDestArray, referenceDestArray, toolkitDestArray
-        ) {
-            println("BlendBitmap $mode $restriction")
-            //logArray("BlendBitmap_$mode src", sourceArray, 48)
-            //logArray("BlendBitmap_$mode dst", destArray, 48)
-            logArray("BlendBitmap_$mode reference out", referenceDestArray, 48)
-            logArray("BlendBitmap_$mode intrinsic out", intrinsicDestArray, 48)
-            logArray("BlendBitmap_$mode toolkit   out", toolkitDestArray, 48)
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testBlur(timer: TimingTracker): Boolean {
-        return arrayOf(1, 3, 8, 25).all { radius ->
-            testOneBitmapBlur(timer, testImage1, radius, null) and
-                    testOneBitmapBlur(timer, testImage1, radius, Range2d(6, 23, 2, 4)) and
-                    commonLayoutsToTry.all { (sizeX, sizeY, restriction) ->
-                        arrayOf(1, 4).all { vectorSize ->
-                            testOneRandomBlur(timer, vectorSize, sizeX, sizeY, radius, restriction)
-                        }
-                    }
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testOneRandomBlur(
-        timer: TimingTracker,
-        vectorSize: Int,
-        sizeX: Int,
-        sizeY: Int,
-        radius: Int,
-        restriction: Range2d?
-    ): Boolean {
-        val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, vectorSize)
-        val intrinsicOutArray = timer.measure("IntrinsicBlur") {
-            intrinsicBlur(
-                renderscriptContext, inputArray, vectorSize, sizeX, sizeY, radius, restriction
-            )
-        }
-        val toolkitOutArray = timer.measure("ToolkitBlur") {
-            toolkit.blur(inputArray, vectorSize, sizeX, sizeY, radius, restriction)
-        }
-        if (!validate) return true
-
-        val referenceOutArray = timer.measure("ReferenceBlur") {
-            referenceBlur(inputArray, vectorSize, sizeX, sizeY, radius, restriction)
-        }
-        return validateSame("blur", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
-            println("blur $vectorSize ($sizeX, $sizeY) radius = $radius $restriction")
-            logArray("blur input        ", inputArray)
-            logArray("blur reference out", referenceOutArray)
-            logArray("blur intrinsic out", intrinsicOutArray)
-            logArray("blur toolkit   out", toolkitOutArray)
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testOneBitmapBlur(
-        timer: TimingTracker,
-        bitmap: Bitmap,
-        radius: Int,
-        restriction: Range2d?
-    ): Boolean {
-        val intrinsicOutArray = timer.measure("IntrinsicBlur") {
-            intrinsicBlur(renderscriptContext, bitmap, radius, restriction)
-        }
-
-        val toolkitOutBitmap = timer.measure("ToolkitBlur") {
-            toolkit.blur(bitmap, radius, restriction)
-        }
-        if (!validate) return true
-
-        val referenceOutArray = timer.measure("ReferenceBlur") {
-            referenceBlur(
-                getBitmapBytes(bitmap),
-                vectorSizeOfBitmap(bitmap),
-                bitmap.width,
-                bitmap.height,
-                radius,
-                restriction
-            )
-        }
-
-        val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
-        return validateSame("blur", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
-            println("BlurBitmap ${bitmap.config} $radius $restriction")
-            logArray("blur reference out", referenceOutArray)
-            logArray("blur intrinsic out", intrinsicOutArray)
-            logArray("blur toolkit   out", toolkitOutArray)
-        }
-    }
-
-    enum class ColorMatrixConversionType {
-        RGB_TO_YUV,
-        YUV_TO_RGB,
-        GREYSCALE,
-        RANDOM
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testColorMatrix(timer: TimingTracker): Boolean {
-        return ColorMatrixConversionType.values().all { conversion ->
-            testOneBitmapColorMatrix(timer, testImage1, conversion, null) and
-                    testOneBitmapColorMatrix(
-                        timer,
-                        testImage1,
-                        conversion,
-                        Range2d(6, 23, 2, 4)
-                    ) and
-                    commonLayoutsToTry.all { (sizeX, sizeY, restriction) ->
-                        (1..4).all { inputVectorSize ->
-                            (1..4).all { outputVectorSize ->
-                                testOneRandomColorMatrix(
-                                    timer,
-                                    inputVectorSize,
-                                    sizeX,
-                                    sizeY,
-                                    outputVectorSize,
-                                    conversion,
-                                    restriction
-                                )
-                            }
-                        }
-                    }
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testOneRandomColorMatrix(
-        timer: TimingTracker,
-        inputVectorSize: Int,
-        sizeX: Int,
-        sizeY: Int,
-        outputVectorSize: Int,
-        conversion: ColorMatrixConversionType,
-        restriction: Range2d?
-    ): Boolean {
-        val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, paddedSize(inputVectorSize))
-        val addVector = randomFloatArray(0x243238, 4, 1, 1, 0.3f)
-        val matrix = when (conversion) {
-            ColorMatrixConversionType.RGB_TO_YUV -> toolkit.rgbToYuvMatrix
-            ColorMatrixConversionType.YUV_TO_RGB -> toolkit.yuvToRgbMatrix
-            ColorMatrixConversionType.GREYSCALE -> toolkit.greyScaleColorMatrix
-            ColorMatrixConversionType.RANDOM -> randomFloatArray(0x234348, 4, 4, 1)
-        }
-
-        val intrinsicOutArray = timer.measure("IntrinsicColorMatrix") {
-            intrinsicColorMatrix(
-                renderscriptContext,
-                conversion,
-                inputArray,
-                inputVectorSize,
-                sizeX,
-                sizeY,
-                outputVectorSize,
-                matrix,
-                addVector,
-                restriction
-            )
-        }
-        val toolkitOutArray = timer.measure("ToolkitColorMatrix") {
-            toolkit.colorMatrix(
-                inputArray,
-                inputVectorSize,
-                sizeX,
-                sizeY,
-                outputVectorSize,
-                matrix,
-                addVector,
-                restriction
-            )
-        }
-        if (!validate) return true
-
-        val referenceOutArray = timer.measure("ReferenceColorMatrix") {
-            referenceColorMatrix(
-                inputArray, inputVectorSize, sizeX, sizeY, outputVectorSize, matrix, addVector,
-                restriction
-            )
-        }
-
-        return validateSame("colorMatrix", intrinsicOutArray, referenceOutArray, toolkitOutArray,
-            outputVectorSize == 3) {
-            println("colorMatrix ($sizeX, $sizeY) $inputVectorSize->$outputVectorSize $restriction")
-            logArray("colorMatrix matrix   ", matrix, 16)
-            logArray("colorMatrix addVector", addVector, 4)
-            logArray("colorMatrix in           ", inputArray)
-            logArray("colorMatrix reference out", referenceOutArray, 300)
-            logArray("colorMatrix intrinsic out", intrinsicOutArray, 300)
-            logArray("colorMatrix toolkit   out", toolkitOutArray, 300)
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testOneBitmapColorMatrix(
-        timer: TimingTracker,
-        bitmap: Bitmap,
-        conversion: ColorMatrixConversionType,
-        restriction: Range2d?
-    ): Boolean {
-        val addVector = randomFloatArray(0x243238, 4, 1, 1, 0.3f)
-        val matrix = when (conversion) {
-            ColorMatrixConversionType.RGB_TO_YUV -> toolkit.rgbToYuvMatrix
-            ColorMatrixConversionType.YUV_TO_RGB -> toolkit.yuvToRgbMatrix
-            ColorMatrixConversionType.GREYSCALE -> toolkit.greyScaleColorMatrix
-            ColorMatrixConversionType.RANDOM -> randomFloatArray(0x234348, 4, 4, 1)
-        }
-
-        val intrinsicOutArray = timer.measure("IntrinsicColorMatrix") {
-            intrinsicColorMatrix(
-                renderscriptContext, conversion, bitmap, matrix, addVector, restriction
-            )
-        }
-        val toolkitOutBitmap = timer.measure("ToolkitColorMatrix") {
-            toolkit.colorMatrix(bitmap, matrix, addVector, restriction)
-        }
-        if (!validate) return true
-
-        val referenceOutArray = timer.measure("ReferenceColorMatrix") {
-            referenceColorMatrix(
-                getBitmapBytes(bitmap), vectorSizeOfBitmap(bitmap), bitmap.width, bitmap.height,
-                vectorSizeOfBitmap(bitmap), matrix, addVector, restriction
-            )
-        }
-
-        val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
-        return validateSame("ColorMatrix", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
-            println("colorMatrixBitmap $restriction")
-            logArray("colorMatrixBitmap matrix   ", matrix, 16)
-            logArray("colorMatrixBitmap addVector", addVector, 4)
-            logArray("colorMatrixBitmap reference out", referenceOutArray)
-            logArray("colorMatrixBitmap intrinsic out", intrinsicOutArray)
-            logArray("colorMatrixBitmap toolkit   out", toolkitOutArray)
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testConvolve(timer: TimingTracker): Boolean {
-        val coefficientsToTry = listOf(
-            randomFloatArray(0x2937021, 3, 3, 1, 0.1f),
-            randomFloatArray(0x2937021, 5, 5, 1, 0.05f)
-        )
-        return coefficientsToTry.all { coefficients ->
-            testOneBitmapConvolve(timer, testImage1, coefficients, null) and
-                    testOneBitmapConvolve(timer, testImage1, coefficients, Range2d(6, 23, 2, 4)) and
-
-                    commonLayoutsToTry.all { (sizeX, sizeY, restriction) ->
-                        (1..4).all { vectorSize ->
-                            testOneRandomConvolve(
-                                timer,
-                                vectorSize,
-                                sizeX,
-                                sizeY,
-                                coefficients,
-                                restriction
-                            )
-                        }
-                    }
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testOneRandomConvolve(
-        timer: TimingTracker,
-        vectorSize: Int,
-        sizeX: Int,
-        sizeY: Int,
-        coefficients: FloatArray,
-        restriction: Range2d?
-    ): Boolean {
-        val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, paddedSize(vectorSize))
-
-        val intrinsicOutArray = timer.measure("IntrinsicConvolve") {
-            intrinsicConvolve(
-                renderscriptContext, inputArray, vectorSize, sizeX, sizeY, coefficients, restriction
-            )
-        }
-        val toolkitOutArray = timer.measure("ToolkitConvolve") {
-            toolkit.convolve(inputArray, vectorSize, sizeX, sizeY, coefficients, restriction)
-        }
-        if (!validate) return true
-
-        val referenceOutArray = timer.measure("ReferenceConvolve") {
-            referenceConvolve(inputArray, vectorSize, sizeX, sizeY, coefficients, restriction)
-        }
-
-        val task = if (coefficients.size == 9) "convolve3x3 $vectorSize" else "convolve5x5 $vectorSize"
-        return validateSame(task, intrinsicOutArray, referenceOutArray, toolkitOutArray) {
-            println("Convolve $vectorSize ($sizeX, $sizeY) $restriction")
-            logArray("Convolve coefficients", coefficients, 25)
-            logArray("Convolve in           ", inputArray)
-            logArray("Convolve reference out", referenceOutArray)
-            logArray("Convolve intrinsic out", intrinsicOutArray)
-            logArray("Convolve toolkit   out", toolkitOutArray)
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testOneBitmapConvolve(
-        timer: TimingTracker,
-        bitmap: Bitmap,
-        coefficients: FloatArray,
-        restriction: Range2d?
-    ): Boolean {
-        val intrinsicOutArray = timer.measure("IntrinsicConvolve") {
-            intrinsicConvolve(renderscriptContext, bitmap, coefficients, restriction)
-        }
-        val toolkitOutBitmap = timer.measure("ToolkitConvolve") {
-            toolkit.convolve(bitmap, coefficients, restriction)
-        }
-        if (!validate) return true
-
-        val referenceOutArray = timer.measure("ReferenceConvolve") {
-            referenceConvolve(
-                getBitmapBytes(bitmap), vectorSizeOfBitmap(bitmap), bitmap.width, bitmap.height,
-                coefficients, restriction
-            )
-        }
-
-        val task = if (coefficients.size == 9) "convolve3x3" else "convolve5x5"
-        val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
-        return validateSame(task, intrinsicOutArray, referenceOutArray, toolkitOutArray) {
-            println("ConvolveBitmap $restriction")
-            logArray("ConvolveBitmap coefficients", coefficients, 25)
-            //logArray("ConvolveBitmap in           ", inputArray)
-            logArray("ConvolveBitmap reference out", referenceOutArray)
-            logArray("ConvolveBitmap intrinsic out", intrinsicOutArray)
-            logArray("ConvolveBitmap toolkit   out", toolkitOutArray)
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testHistogram(timer: TimingTracker): Boolean {
-        val coefficients = floatArrayOf(0.1f, 0.3f, 0.5f, 0.05f)
-        return testOneBitmapHistogram(timer, testImage1, null) and
-                testOneBitmapHistogram(timer, testImage1, Range2d(6, 23, 2, 4)) and
-                testOneBitmapHistogramDot(timer, testImage1, null, null) and
-                testOneBitmapHistogramDot(timer, testImage1, coefficients, null) and
-                testOneBitmapHistogramDot(timer, testImage1, coefficients, Range2d(6, 23, 2, 4)) and
-        commonLayoutsToTry.all { (sizeX, sizeY, restriction) ->
-            (1..4).all { vectorSize ->
-                testOneRandomHistogram(timer, vectorSize, sizeX, sizeY, restriction) &&
-                        testOneRandomHistogramDot(
-                            timer,
-                            vectorSize,
-                            sizeX,
-                            sizeY,
-                            null,
-                            restriction
-                        ) &&
-                        testOneRandomHistogramDot(
-                            timer,
-                            vectorSize,
-                            sizeX,
-                            sizeY,
-                            coefficients.sliceArray(0 until vectorSize),
-                            restriction
-                        )
-            }
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testOneRandomHistogram(
-        timer: TimingTracker,
-        vectorSize: Int,
-        sizeX: Int,
-        sizeY: Int,
-        restriction: Range2d?
-    ): Boolean {
-        val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, paddedSize(vectorSize))
-
-        val intrinsicOutput = timer.measure("IntrinsicHistogram") {
-            intrinsicHistogram(
-                renderscriptContext, inputArray, vectorSize, sizeX, sizeY, restriction
-            )
-        }
-        val toolkitOutput = timer.measure("ToolkitHistogram") {
-            toolkit.histogram(inputArray, vectorSize, sizeX, sizeY, restriction)
-        }
-        if (!validate) return true
-
-        val referenceOutput = timer.measure("ReferenceHistogram") {
-            referenceHistogram(
-                inputArray, vectorSize, sizeX, sizeY, restriction
-            )
-        }
-
-        return validateSame("histogram", intrinsicOutput, referenceOutput, toolkitOutput, 0) {
-            println("histogram $vectorSize ($sizeX, $sizeY) $restriction")
-            logArray("histogram in           ", inputArray, 200)
-            logArray("histogram reference out", referenceOutput, 200)
-            logArray("histogram intrinsic out", intrinsicOutput, 200)
-            logArray("histogram toolkit   out", toolkitOutput, 200)
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testOneBitmapHistogram(
-        timer: TimingTracker,
-        bitmap: Bitmap,
-        restriction: Range2d?
-    ): Boolean {
-        val intrinsicOutput = timer.measure("IntrinsicHistogram") {
-            intrinsicHistogram(renderscriptContext, bitmap, restriction)
-        }
-        val toolkitOutput = timer.measure("ToolkitHistogram") {
-            toolkit.histogram(bitmap, restriction)
-        }
-        if (!validate) return true
-
-        val referenceOutput = timer.measure("ReferenceHistogram") {
-            referenceHistogram(
-                getBitmapBytes(bitmap), vectorSizeOfBitmap(bitmap), bitmap.width, bitmap.height,
-                restriction
-            )
-        }
-
-        return validateSame("histogram", intrinsicOutput, referenceOutput, toolkitOutput, 0) {
-            println("HistogramBitmap $restriction")
-            logArray("HistogramBitmap reference out", referenceOutput)
-            logArray("HistogramBitmap intrinsic out", intrinsicOutput)
-            logArray("HistogramBitmap toolkit   out", toolkitOutput)
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testOneRandomHistogramDot(
-        timer: TimingTracker,
-        vectorSize: Int,
-        sizeX: Int,
-        sizeY: Int,
-        coefficients: FloatArray?, restriction: Range2d?
-    ): Boolean {
-        val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, paddedSize(vectorSize))
-
-        val intrinsicOutArray = timer.measure("IntrinsicHistogramDot") {
-            intrinsicHistogramDot(
-                renderscriptContext, inputArray, vectorSize, sizeX, sizeY, coefficients, restriction
-            )
-        }
-        val toolkitOutArray = timer.measure("ToolkitHistogramDot") {
-            toolkit.histogramDot(
-                inputArray, vectorSize, sizeX, sizeY, coefficients, restriction
-            )
-        }
-        if (!validate) return true
-
-        val referenceOutArray = timer.measure("ReferenceHistogramDot") {
-            referenceHistogramDot(inputArray, vectorSize, sizeX, sizeY, coefficients, restriction)
-        }
-
-        return validateSame("histogramDot", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
-            println("histogramDot $vectorSize ($sizeX, $sizeY) $restriction")
-            logArray("histogramDot coefficients ", coefficients)
-            logArray("histogramDot in           ", inputArray)
-            logArray("histogramDot reference out", referenceOutArray, 256)
-            logArray("histogramDot intrinsic out", intrinsicOutArray, 256)
-            logArray("histogramDot toolkit   out", toolkitOutArray, 256)
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testOneBitmapHistogramDot(
-        timer: TimingTracker,
-        bitmap: Bitmap,
-        coefficients: FloatArray?,
-        restriction: Range2d?
-    ): Boolean {
-        val intrinsicOutArray = timer.measure("IntrinsicHistogramDot") {
-            intrinsicHistogramDot(renderscriptContext, bitmap, coefficients, restriction)
-        }
-        val toolkitOutArray = timer.measure("ToolkitHistogramDot") {
-            toolkit.histogramDot(bitmap, coefficients, restriction)
-        }
-        if (!validate) return true
-
-        val referenceOutArray = timer.measure("ReferenceHistogramDot") {
-            referenceHistogramDot(
-                getBitmapBytes(bitmap),
-                vectorSizeOfBitmap(bitmap),
-                bitmap.width,
-                bitmap.height,
-                coefficients,
-                restriction
-            )
-        }
-
-        return validateSame(
-            "HistogramDotBitmap",
-            intrinsicOutArray,
-            referenceOutArray,
-            toolkitOutArray
-        ) {
-            println("HistogramDotBitmap $restriction")
-            logArray("HistogramDotBitmap coefficients ", coefficients)
-            //logArray("HistogramDotBitmap in           ", inputArray)
-            logArray("HistogramDotBitmap reference out", referenceOutArray, 256)
-            logArray("HistogramDotBitmap intrinsic out", intrinsicOutArray, 256)
-            logArray("HistogramDotBitmap toolkit   out", toolkitOutArray, 256)
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testLut(timer: TimingTracker): Boolean {
-        return testOneBitmapLut(timer, testImage1, null) and
-                testOneBitmapLut(timer, testImage1, Range2d(6, 23, 2, 4)) and
-        commonLayoutsToTry.all { (sizeX, sizeY, restriction) ->
-            testOneRandomLut(timer, sizeX, sizeY, restriction)
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testOneRandomLut(
-        timer: TimingTracker,
-        sizeX: Int,
-        sizeY: Int,
-        restriction: Range2d?
-    ): Boolean {
-        val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, 4)
-        val newRed = randomByteArray(0x32425, 256, 1, 1)
-        val newGreen = randomByteArray(0x1F3225, 256, 1, 1)
-        val newBlue = randomByteArray(0x32D4F27, 256, 1, 1)
-        val newAlpha = randomByteArray(0x3A20001, 256, 1, 1)
-        val table = LookupTable()
-        table.red = newRed
-        table.blue = newBlue
-        table.green = newGreen
-        table.alpha = newAlpha
-
-        val intrinsicOutArray = timer.measure("IntrinsicLUT") {
-            intrinsicLut(
-                renderscriptContext, inputArray, sizeX, sizeY, newRed, newGreen, newBlue, newAlpha,
-                restriction
-            )
-        }
-        val toolkitOutArray = timer.measure("ToolkitLUT") {
-            toolkit.lut(inputArray, sizeX, sizeY, table, restriction)
-        }
-        if (!validate) return true
-
-        val referenceOutArray = timer.measure("ReferenceLUT") {
-            referenceLut(inputArray, sizeX, sizeY, table, restriction)
-        }
-
-        return validateSame("LUT", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
-            println("lut ($sizeX, $sizeY) $restriction")
-            logArray("LUT red  ", newRed, 256)
-            logArray("LUT green", newGreen, 256)
-            logArray("LUT blue ", newBlue, 256)
-            logArray("LUT alpha", newAlpha, 256)
-            logArray("LUT in           ", inputArray)
-            logArray("LUT reference out", referenceOutArray)
-            logArray("LUT intrinsic out", intrinsicOutArray)
-            logArray("LUT toolkit   out", toolkitOutArray)
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testOneBitmapLut(
-        timer: TimingTracker,
-        bitmap: Bitmap,
-        restriction: Range2d?
-    ): Boolean {
-        val newRed = randomByteArray(0x32425, 256, 1, 1)
-        val newGreen = randomByteArray(0x1F3225, 256, 1, 1)
-        val newBlue = randomByteArray(0x32D4F27, 256, 1, 1)
-        val newAlpha = randomByteArray(0x3A20001, 256, 1, 1)
-        val table = LookupTable()
-        table.red = newRed
-        table.blue = newBlue
-        table.green = newGreen
-        table.alpha = newAlpha
-
-        val intrinsicOutArray = timer.measure("IntrinsicLUT") {
-            intrinsicLut(
-                renderscriptContext, bitmap, newRed, newGreen, newBlue, newAlpha, restriction
-            )
-        }
-        val toolkitOutBitmap = timer.measure("ToolkitLUT") {
-            toolkit.lut(bitmap, table, restriction)
-        }
-        if (!validate) return true
-
-        val referenceOutArray = timer.measure("ReferenceLUT") {
-            referenceLut(
-                getBitmapBytes(bitmap),
-                bitmap.width,
-                bitmap.height,
-                table,
-                restriction
-            )
-        }
-
-        val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
-        return validateSame("LutBitmap", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
-            println("LutBitmap $restriction")
-            logArray("LutBitmap red  ", newRed, 256)
-            logArray("LutBitmap green", newGreen, 256)
-            logArray("LutBitmap blue ", newBlue, 256)
-            logArray("LutBitmap alpha", newAlpha, 256)
-            //logArray("LutBitmap in           ", inputArray, 80)
-            logArray("LutBitmap reference out", referenceOutArray)
-            logArray("LutBitmap intrinsic out", intrinsicOutArray)
-            logArray("LutBitmap toolkit   out", toolkitOutArray)
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testLut3d(timer: TimingTracker): Boolean {
-        val cubeSizesToTry = listOf(
-            Dimension(2, 2, 2),
-            Dimension(32, 32, 16),
-            Dimension(256, 256, 256)
-        )
-        return cubeSizesToTry.all { cubeSize ->
-                val identityCube = identityCube(cubeSize)
-                val randomCube = randomCube(0x23424, cubeSize)
-                testOneBitmapLut3d(timer, testImage1, cubeSize, identityCube, 1, null) and
-                        testOneBitmapLut3d(timer, testImage2, cubeSize, randomCube, 3, null) and
-                        testOneBitmapLut3d(timer, testImage2, cubeSize, randomCube, 3, Range2d(6, 23, 2, 4)) and
-                commonLayoutsToTry.all { (sizeX, sizeY, restriction) ->
-                    testOneRandomLut3d(timer, sizeX, sizeY, cubeSize, identityCube, 1, restriction) &&
-                            testOneRandomLut3d(
-                                timer,
-                                sizeX,
-                                sizeY,
-                                cubeSize,
-                                randomCube,
-                                3,
-                                restriction
-                            )
-                }
-            }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testOneRandomLut3d(
-        timer: TimingTracker,
-        sizeX: Int,
-        sizeY: Int,
-        cubeSize: Dimension,
-        cubeArray: ByteArray,
-        allowedIntError: Int, restriction: Range2d?
-    ): Boolean {
-        val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, 4)
-
-        val intrinsicOutArray = timer.measure("IntrinsicLut3d") {
-            intrinsicLut3d(
-                renderscriptContext, inputArray, sizeX, sizeY, cubeArray, cubeSize, restriction
-            )
-        }
-        val toolkitOutArray = timer.measure("ToolkitLut3d") {
-            val toolkitCube = Rgba3dArray(cubeArray, cubeSize.sizeX, cubeSize.sizeY, cubeSize.sizeZ)
-            toolkit.lut3d(inputArray, sizeX, sizeY, toolkitCube, restriction)
-        }
-        if (!validate) return true
-
-        val referenceOutArray = timer.measure("ReferenceLut3d") {
-            val cube = Rgba3dArray(cubeArray, cubeSize.sizeX, cubeSize.sizeY, cubeSize.sizeZ)
-            referenceLut3d(inputArray, sizeX, sizeY, cube, restriction)
-        }
-
-        return validateSame(
-            "lut3d",
-            intrinsicOutArray,
-            referenceOutArray,
-            toolkitOutArray,
-            false,
-            allowedIntError
-        ) {
-            println("lut3d ($sizeX, $sizeY) $restriction")
-            logArray("lut3d cube", cubeArray, 256)
-            logArray("lut3d in           ", inputArray, 64)
-            logArray("lut3d reference out", referenceOutArray, 64)
-            logArray("lut3d intrinsic out", intrinsicOutArray, 64)
-            logArray("lut3d toolkit   out", toolkitOutArray)
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testOneBitmapLut3d(
-        timer: TimingTracker,
-        bitmap: Bitmap,
-        cubeSize: Dimension,
-        cubeArray: ByteArray,
-        allowedIntError: Int, restriction: Range2d?
-    ): Boolean {
-        val intrinsicOutArray = timer.measure("IntrinsicLut3d") {
-            intrinsicLut3d(renderscriptContext, bitmap, cubeArray, cubeSize, restriction)
-        }
-        val toolkitOutBitmap = timer.measure("ToolkitLut3d") {
-            val toolkitCube = Rgba3dArray(cubeArray, cubeSize.sizeX, cubeSize.sizeY, cubeSize.sizeZ)
-            toolkit.lut3d(bitmap, toolkitCube, restriction)
-        }
-        if (!validate) return true
-
-        val referenceOutArray = timer.measure("ReferenceLut3d") {
-            val cube = Rgba3dArray(cubeArray, cubeSize.sizeX, cubeSize.sizeY, cubeSize.sizeZ)
-            referenceLut3d(getBitmapBytes(bitmap), bitmap.width, bitmap.height, cube, restriction)
-        }
-
-        val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
-        return validateSame(
-            "Lut3dBitmap",
-            intrinsicOutArray,
-            referenceOutArray,
-            toolkitOutArray,
-            false,
-            allowedIntError
-        ) {
-            println("Lut3dBitmap $restriction")
-            logArray("Lut3dBitmap cube", cubeArray, 256)
-            //logArray("Lut3dBitmap in           ", inputArray, 64)
-            logArray("Lut3dBitmap reference out", referenceOutArray, 64)
-            logArray("Lut3dBitmap intrinsic out", intrinsicOutArray, 64)
-            logArray("Lut3dBitmap toolkit   out", toolkitOutArray)
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testResize(timer: TimingTracker): Boolean {
-        val factorsToTry = listOf(
-            Pair(1f, 1f),
-            Pair(0.5f, 1f),
-            Pair(2f, 2f),
-            Pair(0.5f, 2f),
-            Pair(2f, 0.5f),
-            // The RenderScript Intrinsic tests used the above factors. It's tempting to use
-            // less regular ones like Pair(6.37f, 0.17f) however this creates small offset
-            // errors between the result provided by the C++ code and the SIMD code. This is
-            // due to the SIMD code using a scaled integer to increment going from one pixel to the
-            // next, while the C++ code uses float operations.
-        )
-        val layoutsToTry = listOf(
-            TestLayout(37, 47, null),
-            TestLayout(60, 10, null),
-            TestLayout(6, 4, Range2d(1, 3, 0, 2)),
-            TestLayout(10, 14, Range2d(2, 3, 3, 7)),
-        )
-
-        return factorsToTry.all { (scaleX, scaleY) ->
-            // Do one resize that's greater than 4x, as that's used in the code but don't do it
-            // for everything, as some images will get very large
-            testOneRandomResize(timer, 1, 25, 30, 6f, 6f, null) and
-            testOneBitmapResize(timer, testImage1, scaleX, scaleY, null) and
-                    testOneBitmapResize(timer, testImage1, scaleX, scaleY, Range2d(6, 23, 2, 4)) and
-                    layoutsToTry.all { (sizeX, sizeY, restriction) ->
-                        (1..4).all { vectorSize ->
-                            testOneRandomResize(
-                                timer,
-                                vectorSize,
-                                sizeX,
-                                sizeY,
-                                scaleX,
-                                scaleY,
-                                restriction
-                            )
-                        }
-                    }
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testOneRandomResize(
-        timer: TimingTracker,
-        vectorSize: Int,
-        inSizeX: Int,
-        inSizeY: Int,
-        scaleX: Float,
-        scaleY: Float,
-        restriction: Range2d?
-    ): Boolean {
-        val inputArray = randomByteArray(0x50521f0, inSizeX, inSizeY, paddedSize(vectorSize))
-        val outSizeX = (inSizeX * scaleX).toInt()
-        val outSizeY = (inSizeY * scaleY).toInt()
-
-        val intrinsicOutArray = timer.measure("IntrinsicResize") {
-            intrinsicResize(
-                renderscriptContext, inputArray, vectorSize, inSizeX, inSizeY, outSizeX, outSizeY,
-                restriction
-            )
-        }
-        val toolkitOutArray = timer.measure("ToolkitResize") {
-            toolkit.resize(
-                inputArray, vectorSize, inSizeX, inSizeY, outSizeX, outSizeY, restriction
-            )
-        }
-        if (!validate) return true
-
-        val referenceOutArray = timer.measure("ReferenceResize") {
-            referenceResize(
-                inputArray, vectorSize, inSizeX, inSizeY, outSizeX, outSizeY, restriction
-            )
-        }
-
-        return validateSame("resize", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
-            println("resize $vectorSize ($inSizeX, $inSizeY) by ($scaleX, $scaleY) to ($outSizeX, $outSizeY), $restriction")
-            logArray("resize in           ", inputArray)
-            logArray("resize reference out", referenceOutArray)
-            logArray("resize intrinsic out", intrinsicOutArray)
-            logArray("resize toolkit   out", toolkitOutArray)
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testOneBitmapResize(
-        timer: TimingTracker,
-        bitmap: Bitmap,
-        scaleX: Float,
-        scaleY: Float,
-        restriction: Range2d?
-    ): Boolean {
-        // println("Doing resize $inSizeX x $inSizeY x $vectorSize, $scaleX x $scaleY, $restriction")
-        val outSizeX = (bitmap.width * scaleX).toInt()
-        val outSizeY = (bitmap.height * scaleY).toInt()
-
-        val intrinsicOutArray = timer.measure("IntrinsicResize") {
-            intrinsicResize(renderscriptContext, bitmap, outSizeX, outSizeY, restriction)
-        }
-        val toolkitOutBitmap = timer.measure("ToolkitResize") {
-            toolkit.resize(bitmap, outSizeX, outSizeY, restriction)
-        }
-        if (!validate) return true
-
-        val referenceOutArray = timer.measure("ReferenceResize") {
-            referenceResize(
-                getBitmapBytes(bitmap),
-                vectorSizeOfBitmap(bitmap),
-                bitmap.width,
-                bitmap.height,
-                outSizeX,
-                outSizeY,
-                restriction
-            )
-        }
-
-        val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
-        return validateSame("ResizeBitmap", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
-            println("ResizeBitmap by ($scaleX, $scaleY) to ($outSizeX, $outSizeY), $restriction")
-            //logArray("ResizeBitmap in           ", inputArray, 100)
-            logArray("ResizeBitmap reference out", referenceOutArray)
-            logArray("ResizeBitmap intrinsic out", intrinsicOutArray)
-            logArray("ResizeBitmap toolkit   out", toolkitOutArray)
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testYuvToRgb(timer: TimingTracker): Boolean {
-        val layoutsToTry = listOf(
-            // Don't try sizeX with odd values. That's not allowed by definition of some
-            // of the video formats.
-            TestLayout(10, 14, null),
-            TestLayout(64, 40, null),
-            TestLayout(96, 94, null),
-        )
-        return layoutsToTry.all { (sizeX, sizeY, _) ->
-            YuvFormat.values().all { format ->
-                testOneRandomYuvToRgb(timer, sizeX, sizeY, format) and
-                testOneRandomYuvToRgbBitmap(timer, sizeX, sizeY, format)
-            }
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testOneRandomYuvToRgb(
-        timer: TimingTracker,
-        sizeX: Int,
-        sizeY: Int,
-        format: YuvFormat
-    ): Boolean {
-        // The RenderScript Intrinsic does not handle this combination correctly.
-        if (format == YuvFormat.YV12 && sizeX % 32 != 0) {
-            return true
-        }
-        val inputArray = randomYuvArray(0x50521f0, sizeX, sizeY, format)
-
-        val intrinsicOutArray = timer.measure("IntrinsicYuvToRgb") {
-            intrinsicYuvToRgb(renderscriptContext, inputArray, sizeX, sizeY, format)
-        }
-        val toolkitOutArray = timer.measure("ToolkitYuvToRgb") {
-            toolkit.yuvToRgb(inputArray, sizeX, sizeY, format)
-        }
-        if (!validate) return true
-
-        val referenceOutArray = timer.measure("ReferenceYuvToRgb") {
-            referenceYuvToRgb(inputArray, sizeX, sizeY, format)
-        }
-
-        return validateSame("yuvToRgb", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
-            println("yuvToRgb ($sizeX, $sizeY) $format")
-            logArray("yuvToRgb in           ", inputArray)
-            logArray("yuvToRgb reference out", referenceOutArray)
-            logArray("yuvToRgb intrinsic out", intrinsicOutArray)
-            logArray("yuvToRgb toolkit   out", toolkitOutArray)
-        }
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun testOneRandomYuvToRgbBitmap(
-        timer: TimingTracker,
-        sizeX: Int,
-        sizeY: Int,
-        format: YuvFormat
-    ): Boolean {
-        // The RenderScript Intrinsic does not handle this combination correctly.
-        if (format == YuvFormat.YV12 && sizeX % 32 != 0) {
-            return true
-        }
-        val inputArray = randomYuvArray(0x50521f0, sizeX, sizeY, format)
-
-        val intrinsicOutArray = timer.measure("IntrinsicYuvToRgb") {
-            intrinsicYuvToRgb(renderscriptContext, inputArray, sizeX, sizeY, format)
-        }
-        val toolkitOutBitmap = timer.measure("ToolkitYuvToRgb") {
-            toolkit.yuvToRgbBitmap(inputArray, sizeX, sizeY, format)
-        }
-        if (!validate) return true
-
-        val referenceOutArray = timer.measure("ReferenceYuvToRgb") {
-            referenceYuvToRgb(inputArray, sizeX, sizeY, format)
-        }
-
-        val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
-        return validateSame("yuvToRgb", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
-            println("yuvToRgb ($sizeX, $sizeY) $format")
-            logArray("yuvToRgb in           ", inputArray)
-            logArray("yuvToRgb reference out", referenceOutArray)
-            logArray("yuvToRgb intrinsic out", intrinsicOutArray)
-            logArray("yuvToRgb toolkit   out", toolkitOutArray)
-        }
-    }
-
-    /**
-     * Verifies that the arrays returned by the Intrinsic, the reference code, and the Toolkit
-     * are all within a margin of error.
-     *
-     * RenderScript Intrinsic test (rc/android/cts/rscpp/RSCppTest.java) used 3 for ints.
-     * For floats, rc/android/cts/rscpp/verify.rscript uses 0.0001f.
-     */
-    @ExperimentalUnsignedTypes
-    private fun validateSame(
-        task: String,
-        intrinsic: ByteArray,
-        reference: ByteArray,
-        toolkit: ByteArray,
-        skipFourth: Boolean = false,
-        allowedIntDelta: Int = 3,
-        errorLogging: () -> Unit
-    ): Boolean {
-        val success = validateAgainstReference(
-            task, reference, "Intrinsic", intrinsic, skipFourth, allowedIntDelta
-        ) and validateAgainstReference(
-            task, reference, "Toolkit", toolkit, skipFourth, allowedIntDelta
-        )
-        if (!success) {
-            println("$task FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!")
-            errorLogging()
-        }
-        return success
-    }
-
-    private fun validateSame(
-        task: String,
-        intrinsic: IntArray,
-        reference: IntArray,
-        toolkit: IntArray,
-        allowedIntDelta: Int = 3,
-        errorLogging: () -> Unit
-    ): Boolean {
-        val success = validateAgainstReference(
-            task, reference, "Intrinsic", intrinsic, allowedIntDelta
-        ) and validateAgainstReference(
-            task, reference, "Toolkit", toolkit, allowedIntDelta
-        )
-        if (!success) {
-            println("$task FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!")
-            errorLogging()
-        }
-        return success
-    }
-
-    @ExperimentalUnsignedTypes
-    private fun validateAgainstReference(
-        task: String,
-        in1: ByteArray,
-        name2: String,
-        in2: ByteArray,
-        skipFourth: Boolean,
-        allowedIntDelta: Int
-    ): Boolean {
-        if (in1.size != in2.size) {
-            println("$task. Sizes don't match: Reference ${in1.size}, $name2 ${in2.size}")
-            return false
-        }
-        var same = true
-        val maxDetails = 80
-        val diffs = CharArray(min(in1.size, maxDetails)) {'.'}
-        for (i in in1.indices) {
-            if (skipFourth && i % 4 == 3) {
-                continue
-            }
-            val delta = abs(in1[i].toUByte().toInt() - in2[i].toUByte().toInt())
-            if (delta > allowedIntDelta) {
-                if (same) {
-                    println(
-                        "$task. At $i, Reference is ${in1[i].toUByte()}, $name2 is ${in2[i].toUByte()}"
-                    )
-                }
-                if (i < maxDetails) diffs[i] = 'X'
-                same = false
-            }
-        }
-        if (!same) {
-            for (i in 0 until (min(in1.size, maxDetails) / 4)) print("%-3d|".format(i))
-            println()
-            println(diffs)
-        }
-        return same
-    }
-
-    private fun validateAgainstReference(
-        task: String,
-        in1: IntArray,
-        name2: String,
-        in2: IntArray,
-        allowedIntDelta: Int
-    ): Boolean {
-        if (in1.size != in2.size) {
-            println("$task. Sizes don't match: Reference ${in1.size}, $name2 ${in2.size}")
-            return false
-        }
-        for (i in in1.indices) {
-            val delta = abs(in1[i] - in2[i])
-            if (delta > allowedIntDelta) {
-                println("$task. At $i, Reference is ${in1[i]}, $name2 is ${in2[i]}")
-                return false
-            }
-        }
-        return true
-    }
-}

diff --git a/toolkit/test/Android.bp b/toolkit/test/Android.bp
deleted file mode 100644
index abeace1..0000000
--- a/toolkit/test/Android.bp
+++ /dev/null

@@ -1,35 +0,0 @@
-//
-// Copyright (C) 2021 The Android Open Source Project
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-package {
-    default_applicable_licenses: ["Android-Apache-2.0"],
-}
-
-android_app {
-    name: "RenderScriptToolkitTest",
-//    srcs: ["src/**/*.kt"],
-    sdk_version: "current",
-    resource_dirs: ["res"],
-//    jni_libs: [ "librenderscripttoolkit"],
-//    certificate: "platform",
-//    //product_specific: true,
-//    //optimize: {
-//    //    proguard_flags_files: ["proguard.flags"],
-//    //},
-//    shared_libs: ["librenderscripttoolkit",
-//
-//    ]
-}

diff --git a/toolkit/test/BufferUtils.kt b/toolkit/test/BufferUtils.kt
deleted file mode 100644
index f2197b0..0000000
--- a/toolkit/test/BufferUtils.kt
+++ /dev/null

@@ -1,508 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-import android.graphics.Bitmap
-import android.graphics.Canvas
-import android.renderscript.Element
-import android.renderscript.RenderScript
-import android.renderscript.toolkit.Range2d
-import android.renderscript.toolkit.Rgba3dArray
-import android.renderscript.toolkit.YuvFormat
-import java.nio.ByteBuffer
-import java.util.Random
-import kotlin.math.floor
-import kotlin.math.max
-import kotlin.math.min
-
-/**
- * A vector of 4 integers.
- */
-class Int4(
-    var x: Int = 0,
-    var y: Int = 0,
-    var z: Int = 0,
-    var w: Int = 0
-) {
-    operator fun plus(other: Int4) = Int4(x + other.x, y + other.y, z + other.z, w + other.w)
-    operator fun plus(n: Int) = Int4(x + n, y + n, z + n, w + n)
-
-    operator fun minus(other: Int4) = Int4(x - other.x, y - other.y, z - other.z, w - other.w)
-    operator fun minus(n: Int) = Int4(x - n, y - n, z - n, w - n)
-
-    operator fun times(other: Int4) = Int4(x * other.x, y * other.y, z * other.z, w * other.w)
-    operator fun times(n: Int) = Int4(x * n, y * n, z * n, w * n)
-
-    fun toFloat4() = Float4(x.toFloat(), y.toFloat(), z.toFloat(), w.toFloat())
-}
-
-fun min(a: Int4, b: Int4) = Int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w))
-
-/**
- * A vector of 4 floats.
- */
-data class Float4(
-    var x: Float = 0f,
-    var y: Float = 0f,
-    var z: Float = 0f,
-    var w: Float = 0f
-) {
-    operator fun plus(other: Float4) = Float4(x + other.x, y + other.y, z + other.z, w + other.w)
-    operator fun plus(f: Float) = Float4(x + f, y + f, z + f, w + f)
-
-    operator fun minus(other: Float4) = Float4(x - other.x, y - other.y, z - other.z, w - other.w)
-    operator fun minus(f: Float) = Float4(x - f, y - f, z - f, w - f)
-
-    operator fun times(other: Float4) = Float4(x * other.x, y * other.y, z * other.z, w * other.w)
-    operator fun times(f: Float) = Float4(x * f, y * f, z * f, w * f)
-
-    operator fun div(other: Float4) = Float4(x / other.x, y / other.y, z / other.z, w / other.w)
-    operator fun div(f: Float) = Float4(x / f, y / f, z / f, w / f)
-
-    fun intFloor() = Int4(floor(x).toInt(), floor(y).toInt(), floor(z).toInt(), floor(w).toInt())
-}
-
-/**
- * Convert a UByteArray to a Float4 vector
- */
-@ExperimentalUnsignedTypes
-fun UByteArray.toFloat4(): Float4 {
-    require(size == 4)
-    return Float4(this[0].toFloat(), this[1].toFloat(), this[2].toFloat(), this[3].toFloat())
-}
-
-/**
- * Convert a ByteArray to a Float4 vector
- */
-@ExperimentalUnsignedTypes
-fun ByteArray.toFloat4(): Float4 {
-    require(size == 4)
-    return Float4(
-        this[0].toUByte().toFloat(),
-        this[1].toUByte().toFloat(),
-        this[2].toUByte().toFloat(),
-        this[3].toUByte().toFloat()
-    )
-}
-
-data class Dimension(val sizeX: Int, val sizeY: Int, val sizeZ: Int)
-
-/**
- * An RGBA value represented by 4 Int.
- *
- * Note that the arithmetical operations consider a 0..255 value the equivalent of 0f..1f.
- * After adding or subtracting, the value is clamped. After multiplying, the value is rescaled to
- * stay in the 0..255 range. This is useful for the Blend operation.
- */
-@ExperimentalUnsignedTypes
-data class Rgba(
-    var r: Int = 0,
-    var g: Int = 0,
-    var b: Int = 0,
-    var a: Int = 0
-) {
-    operator fun plus(other: Rgba) =
-        Rgba(r + other.r, g + other.g, b + other.b, a + other.a).clampToUByteRange()
-
-    operator fun minus(other: Rgba) =
-        Rgba(r - other.r, g - other.g, b - other.b, a - other.a).clampToUByteRange()
-
-    operator fun times(other: Rgba) = Rgba(r * other.r, g * other.g, b * other.b, a * other.a) shr 8
-    operator fun times(scalar: Int) = Rgba(r * scalar, g * scalar, b * scalar, a * scalar) shr 8
-
-    infix fun xor(other: Rgba) = Rgba(r xor other.r, g xor other.g, b xor other.b, a xor other.a)
-
-    infix fun shr(other: Int) = Rgba(r shr other, g shr other, b shr other, a shr other)
-
-    private fun clampToUByteRange() = Rgba(
-        r.clampToUByteRange(),
-        g.clampToUByteRange(),
-        b.clampToUByteRange(),
-        a.clampToUByteRange()
-    )
-}
-
-/**
- * A 2D array of UByte vectors, stored in row-major format.
- *
- * Arrays of vectorSize == 3 are padded to 4.
- */
-@ExperimentalUnsignedTypes
-class Vector2dArray(
-    val values: UByteArray,
-    val vectorSize: Int,
-    val sizeX: Int,
-    val sizeY: Int
-) {
-    /**
-     * If true, index access that would try to get a value that's out of bounds will simply
-     * return the border value instead. E.g. for [3, -3] would return the value for [3, 0],
-     * assuming that the sizeX > 3.
-     */
-    var clipReadToRange: Boolean = false
-
-    operator fun get(x: Int, y: Int): UByteArray {
-        var fixedX = x
-        var fixedY = y
-        if (clipReadToRange) {
-            fixedX = min(max(x, 0), sizeX - 1)
-            fixedY = min(max(y, 0), sizeY - 1)
-        } else {
-            require(x in 0 until sizeX && y in 0 until sizeY) { "Out of bounds" }
-        }
-        val start = indexOfVector(fixedX, fixedY)
-        return UByteArray(paddedSize(vectorSize)) { values[start + it] }
-    }
-
-    operator fun set(x: Int, y: Int, value: UByteArray) {
-        require(value.size == paddedSize(vectorSize)) { "Not the expected vector size" }
-        require(x in 0 until sizeX && y in 0 until sizeY) { "Out of bounds" }
-        val start = indexOfVector(x, y)
-        for (i in value.indices) {
-            values[start + i] = value[i]
-        }
-    }
-
-    private fun indexOfVector(x: Int, y: Int) = ((y * sizeX) + x) * paddedSize(vectorSize)
-
-    fun createSameSized() = Vector2dArray(UByteArray(values.size), vectorSize, sizeX, sizeY)
-
-    fun forEach(restriction: Range2d?, work: (Int, Int) -> (Unit)) {
-        forEachCell(sizeX, sizeY, restriction, work)
-    }
-}
-
-/**
- * A 2D array of float vectors, stored in row-major format.
- *
- * Arrays of vectorSize == 3 are padded to 4.
- */
-class FloatVector2dArray(
-    val values: FloatArray,
-    val vectorSize: Int,
-    val sizeX: Int,
-    val sizeY: Int
-) {
-    /**
-     * If true, index access that would try to get a value that's out of bounds will simply
-     * return the border value instead. E.g. for [3, -3] would return the value for [3, 0],
-     * assuming that the sizeX > 3.
-     */
-    var clipAccessToRange: Boolean = false
-
-    operator fun get(x: Int, y: Int): FloatArray {
-        var fixedX = x
-        var fixedY = y
-        if (clipAccessToRange) {
-            fixedX = min(max(x, 0), sizeX - 1)
-            fixedY = min(max(y, 0), sizeY - 1)
-        } else {
-            require(x in 0 until sizeX && y in 0 until sizeY) { "Out of bounds" }
-        }
-        val start = indexOfVector(fixedX, fixedY)
-        return FloatArray(vectorSize) { values[start + it] }
-    }
-
-    operator fun set(x: Int, y: Int, value: FloatArray) {
-        require(x in 0 until sizeX && y in 0 until sizeY) { "Out of bounds" }
-        val start = indexOfVector(x, y)
-        for (i in value.indices) {
-            values[start + i] = value[i]
-        }
-    }
-
-    private fun indexOfVector(x: Int, y: Int) = ((y * sizeX) + x) * paddedSize(vectorSize)
-
-    fun createSameSized() = FloatVector2dArray(FloatArray(values.size), vectorSize, sizeX, sizeY)
-
-    fun forEach(restriction: Range2d?, work: (Int, Int) -> (Unit)) {
-        forEachCell(sizeX, sizeY, restriction, work)
-    }
-}
-
-/**
- * A 2D array of RGBA data.
- */
-@ExperimentalUnsignedTypes
-class Rgba2dArray(
-    private val values: ByteArray,
-    val sizeX: Int,
-    val sizeY: Int
-) {
-    operator fun get(x: Int, y: Int): Rgba {
-        val i = indexOfVector(x, y)
-        return Rgba(
-            values[i].toUByte().toInt(),
-            values[i + 1].toUByte().toInt(),
-            values[i + 2].toUByte().toInt(),
-            values[i + 3].toUByte().toInt()
-        )
-    }
-
-    operator fun set(x: Int, y: Int, value: Rgba) {
-        // Verify that x, y, z, w are in the 0..255 range
-        require(value.r in 0..255)
-        require(value.g in 0..255)
-        require(value.b in 0..255)
-        require(value.a in 0..255)
-        val i = indexOfVector(x, y)
-        values[i] = value.r.toUByte().toByte()
-        values[i + 1] = value.g.toUByte().toByte()
-        values[i + 2] = value.b.toUByte().toByte()
-        values[i + 3] = value.a.toUByte().toByte()
-    }
-
-    private fun indexOfVector(x: Int, y: Int) = ((y * sizeX) + x) * 4
-
-    fun forEachCell(restriction: Range2d?, work: (Int, Int) -> (Unit)) =
-        forEachCell(sizeX, sizeY, restriction, work)
-}
-
-/**
- * Return a value that's between start and end, with the fraction indicating how far along.
- */
-fun mix(start: Float, end: Float, fraction: Float) = start + (end - start) * fraction
-
-fun mix(a: Float4, b: Float4, fraction: Float) = Float4(
-    mix(a.x, b.x, fraction),
-    mix(a.y, b.y, fraction),
-    mix(a.z, b.z, fraction),
-    mix(a.w, b.w, fraction)
-)
-
-/**
- * For vectors of size 3, the original RenderScript has them occupy the same space as a size 4.
- * While RenderScript had a method to avoid this padding, it did not apply to Intrinsics.
- *
- * To preserve compatibility, the Toolkit doing the same.
- */
-fun paddedSize(vectorSize: Int) = if (vectorSize == 3) 4 else vectorSize
-
-/**
- * Create a ByteArray of the specified size filled with random data.
- */
-fun randomByteArray(seed: Long, sizeX: Int, sizeY: Int, elementSize: Int): ByteArray {
-    val r = Random(seed)
-    return ByteArray(sizeX * sizeY * elementSize) { (r.nextInt(255) - 128).toByte() }
-}
-
-/**
- * Create a FloatArray of the specified size filled with random data.
- *
- * By default, the random data is between 0f and 1f. The factor can be used to scale that.
- */
-fun randomFloatArray(
-    seed: Long,
-    sizeX: Int,
-    sizeY: Int,
-    elementSize: Int,
-    factor: Float = 1f
-): FloatArray {
-    val r = Random(seed)
-    return FloatArray(sizeX * sizeY * elementSize) { r.nextFloat() * factor }
-}
-
-/**
- * Create a cube of the specified size filled with random data.
- */
-fun randomCube(seed: Long, cubeSize: Dimension): ByteArray {
-    val r = Random(seed)
-    return ByteArray(cubeSize.sizeX * cubeSize.sizeY * cubeSize.sizeZ * 4) {
-        (r.nextInt(255) - 128).toByte()
-    }
-}
-
-/**
- * Create the identity cube, i.e. one that if used in Lut3d, the output is the same as the input
- */
-@ExperimentalUnsignedTypes
-fun identityCube(cubeSize: Dimension): ByteArray {
-    val data = ByteArray(cubeSize.sizeX * cubeSize.sizeY * cubeSize.sizeZ * 4)
-    val cube = Rgba3dArray(data, cubeSize.sizeX, cubeSize.sizeY, cubeSize.sizeZ)
-    for (z in 0 until cubeSize.sizeZ) {
-        for (y in 0 until cubeSize.sizeY) {
-            for (x in 0 until cubeSize.sizeX) {
-                cube[x, y, z] =
-                    byteArrayOf(
-                        (x * 255 / (cubeSize.sizeX - 1)).toByte(),
-                        (y * 255 / (cubeSize.sizeY - 1)).toByte(),
-                        (z * 255 / (cubeSize.sizeZ - 1)).toByte(),
-                        (255).toByte()
-                    )
-            }
-        }
-    }
-    return data
-}
-
-fun randomYuvArray(seed: Long, sizeX: Int, sizeY: Int, format: YuvFormat): ByteArray {
-    // YUV formats are not well defined for odd dimensions
-    require(sizeX % 2 == 0 && sizeY % 2 == 0)
-    val halfSizeX = sizeX / 2
-    val halfSizeY = sizeY / 2
-    var totalSize = 0
-    when (format) {
-        YuvFormat.YV12 -> {
-            val strideX = roundUpTo16(sizeX)
-            totalSize = strideX * sizeY + roundUpTo16(strideX / 2) * halfSizeY * 2
-        }
-        YuvFormat.NV21 -> totalSize = sizeX * sizeY + halfSizeX * halfSizeY * 2
-        else -> require(false) { "Unknown YUV format $format" }
-    }
-
-    return randomByteArray(seed, totalSize, 1, 1)
-}
-
-/**
- * Converts a float to a byte, clamping to make it fit the limited range.
- */
-@ExperimentalUnsignedTypes
-fun Float.clampToUByte(): UByte = min(255, max(0, (this + 0.5f).toInt())).toUByte()
-
-/**
- * Converts a FloatArray to UByteArray, clamping.
- */
-@ExperimentalUnsignedTypes
-fun FloatArray.clampToUByte() = UByteArray(size) { this[it].clampToUByte() }
-
-/**
- * Limits an Int to what can fit in a UByte.
- */
-fun Int.clampToUByteRange(): Int = min(255, max(0, this))
-
-/**
- * Converts an Int to a UByte, clamping.
- */
-@ExperimentalUnsignedTypes
-fun Int.clampToUByte(): UByte = this.clampToUByteRange().toUByte()
-
-/**
- * Converts a float (0f .. 1f) to a byte (0 .. 255)
- */
-@ExperimentalUnsignedTypes
-fun unitFloatClampedToUByte(num: Float): UByte = (num * 255f).clampToUByte()
-
-/**
- * Convert a byte (0 .. 255) to a float (0f .. 1f)
- */
-@ExperimentalUnsignedTypes
-fun byteToUnitFloat(num: UByte) = num.toFloat() * 0.003921569f
-
-@ExperimentalUnsignedTypes
-fun UByteArray.toFloatArray() = FloatArray(size) { this[it].toFloat() }
-
-/**
- * For each cell that's in the 2D array defined by sizeX and sizeY, and clipped down by the
- * restriction, invoke the work function.
- */
-fun forEachCell(sizeX: Int, sizeY: Int, restriction: Range2d?, work: (Int, Int) -> (Unit)) {
-    val startX = restriction?.startX ?: 0
-    val startY = restriction?.startY ?: 0
-    val endX = restriction?.endX ?: sizeX
-    val endY = restriction?.endY ?: sizeY
-    for (y in startY until endY) {
-        for (x in startX until endX) {
-            work(x, y)
-        }
-    }
-}
-
-operator fun FloatArray.times(other: FloatArray) = FloatArray(size) { this[it] * other[it] }
-operator fun FloatArray.times(other: Float) = FloatArray(size) { this[it] * other }
-operator fun FloatArray.plus(other: FloatArray) = FloatArray(size) { this[it] + other[it] }
-operator fun FloatArray.minus(other: FloatArray) = FloatArray(size) { this[it] - other[it] }
-
-fun renderScriptVectorElementForU8(rs: RenderScript?, vectorSize: Int): Element {
-    when (vectorSize) {
-        1 -> return Element.U8(rs)
-        2 -> return Element.U8_2(rs)
-        3 -> return Element.U8_3(rs)
-        4 -> return Element.U8_4(rs)
-    }
-    throw java.lang.IllegalArgumentException("RenderScriptToolkit tests. Only vectors of size 1-4 are supported. $vectorSize provided.")
-}
-
-fun renderScriptVectorElementForI32(rs: RenderScript?, vectorSize: Int): Element {
-    when (vectorSize) {
-        1 -> return Element.I32(rs)
-        2 -> return Element.I32_2(rs)
-        3 -> return Element.I32_3(rs)
-        4 -> return Element.I32_4(rs)
-    }
-    throw java.lang.IllegalArgumentException("RenderScriptToolkit tests. Only vectors of size 1-4 are supported. $vectorSize provided.")
-}
-
-/* When we'll handle floats
-fun renderScriptVectorElementForF32(rs: RenderScript?, vectorSize: Int): Element {
-    when (vectorSize) {
-        1 -> return Element.F32(rs)
-        2 -> return Element.F32_2(rs)
-        3 -> return Element.F32_3(rs)
-        4 -> return Element.F32_4(rs)
-    }
-    throw java.lang.IllegalArgumentException("RenderScriptToolkit tests. Only vectors of size 1-4 are supported. $vectorSize provided.")
-}*/
-
-fun renderScriptElementForBitmap(context: RenderScript, bitmap: Bitmap): Element {
-    return when (val config = bitmap.config) {
-        Bitmap.Config.ALPHA_8 -> Element.A_8(context)
-        Bitmap.Config.ARGB_8888 -> Element.RGBA_8888(context)
-        else -> throw IllegalArgumentException("RenderScript Toolkit can't support bitmaps with config $config.")
-    }
-}
-
-fun getBitmapBytes(bitmap: Bitmap): ByteArray {
-    val buffer: ByteBuffer = ByteBuffer.allocate(bitmap.byteCount)
-    bitmap.copyPixelsToBuffer(buffer)
-    return buffer.array()
-}
-
-fun vectorSizeOfBitmap(bitmap: Bitmap): Int {
-    return when (val config = bitmap.config) {
-        Bitmap.Config.ALPHA_8 -> 1
-        Bitmap.Config.ARGB_8888 -> 4
-        else -> throw IllegalArgumentException("RenderScript Toolkit can't support bitmaps with config $config.")
-    }
-}
-
-fun duplicateBitmap(original: Bitmap): Bitmap {
-    val copy = Bitmap.createBitmap(original.width, original.height, original.config)
-    val canvas = Canvas(copy)
-    canvas.drawBitmap(original, 0f, 0f, null)
-    return copy
-}
-
-@ExperimentalUnsignedTypes
-fun logArray(prefix: String, array: ByteArray, number: Int = 20) {
-    val values = array.joinToString(limit = number) { it.toUByte().toString() }
-    println("$prefix[${array.size}] $values}\n")
-}
-
-fun logArray(prefix: String, array: IntArray, number: Int = 20) {
-    val values = array.joinToString(limit = number)
-    println("$prefix[${array.size}] $values}\n")
-}
-
-fun logArray(prefix: String, array: FloatArray?, number: Int = 20) {
-    val values = array?.joinToString(limit = number) { "%.2f".format(it) } ?: "(null)"
-    println("$prefix[${array?.size}] $values}\n")
-}
-
-fun roundUpTo16(value: Int): Int {
-    require(value >= 0)
-    return (value + 15) and 15.inv()
-}

diff --git a/toolkit/test/IntrinsicBlend.kt b/toolkit/test/IntrinsicBlend.kt
deleted file mode 100644
index 873cb15..0000000
--- a/toolkit/test/IntrinsicBlend.kt
+++ /dev/null

@@ -1,188 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-import android.graphics.Bitmap
-import android.renderscript.Allocation
-import android.renderscript.Element
-import android.renderscript.RenderScript
-import android.renderscript.Script
-import android.renderscript.ScriptIntrinsicBlend
-import android.renderscript.Type
-import android.renderscript.toolkit.BlendingMode
-import android.renderscript.toolkit.Range2d
-
-/**
- * Does a Blend operation using the RenderScript Intrinsics.
- */
-fun intrinsicBlend(
-    context: RenderScript,
-    mode: BlendingMode,
-    sourceArray: ByteArray,
-    destArray: ByteArray,
-    sizeX: Int,
-    sizeY: Int,
-    restriction: Range2d?
-) {
-    val scriptBlend = ScriptIntrinsicBlend.create(context, Element.U8_4(context))
-    val builder = Type.Builder(context, Element.U8_4(context))
-    builder.setX(sizeX)
-    builder.setY(sizeY)
-    val arrayType = builder.create()
-    val sourceAllocation = Allocation.createTyped(context, arrayType)
-    val destAllocation = Allocation.createTyped(context, arrayType)
-    sourceAllocation.copyFrom(sourceArray)
-    destAllocation.copyFrom(destArray)
-
-    callBlendForEach(scriptBlend, sourceAllocation, destAllocation, mode, restriction)
-    destAllocation.copyTo(destArray)
-
-    sourceAllocation.destroy()
-    destAllocation.destroy()
-    arrayType.destroy()
-    scriptBlend.destroy()
-}
-
-fun intrinsicBlend(
-    context: RenderScript,
-    mode: BlendingMode,
-    sourceBitmap: Bitmap,
-    destBitmap: Bitmap,
-    restriction: Range2d?
-) {
-    val scriptBlend = ScriptIntrinsicBlend.create(context, Element.U8_4(context))
-    val sourceAllocation = Allocation.createFromBitmap(context, sourceBitmap)
-    val destAllocation = Allocation.createFromBitmap(context, destBitmap)
-    sourceAllocation.copyFrom(sourceBitmap)
-    destAllocation.copyFrom(destBitmap)
-
-    callBlendForEach(scriptBlend, sourceAllocation, destAllocation, mode, restriction)
-    destAllocation.copyTo(destBitmap)
-
-    sourceAllocation.destroy()
-    destAllocation.destroy()
-    scriptBlend.destroy()
-}
-
-private fun callBlendForEach(
-    scriptBlend: ScriptIntrinsicBlend,
-    sourceAllocation: Allocation,
-    destAllocation: Allocation,
-    mode: BlendingMode,
-    restriction: Range2d?
-) {
-    if (restriction != null) {
-        val options = Script.LaunchOptions()
-        options.setX(restriction.startX, restriction.endX)
-        options.setY(restriction.startY, restriction.endY)
-        when (mode) {
-            BlendingMode.CLEAR -> scriptBlend.forEachClear(
-                sourceAllocation, destAllocation, options
-            )
-            BlendingMode.SRC -> scriptBlend.forEachSrc(
-                sourceAllocation, destAllocation, options
-            )
-            BlendingMode.DST -> scriptBlend.forEachDst(
-                sourceAllocation, destAllocation, options
-            )
-            BlendingMode.SRC_OVER -> scriptBlend.forEachSrcOver(
-                sourceAllocation, destAllocation, options
-            )
-            BlendingMode.DST_OVER -> scriptBlend.forEachDstOver(
-                sourceAllocation, destAllocation, options
-            )
-            BlendingMode.SRC_IN -> scriptBlend.forEachSrcIn(
-                sourceAllocation, destAllocation, options
-            )
-            BlendingMode.DST_IN -> scriptBlend.forEachDstIn(
-                sourceAllocation, destAllocation, options
-            )
-            BlendingMode.SRC_OUT -> scriptBlend.forEachSrcOut(
-                sourceAllocation, destAllocation, options
-            )
-            BlendingMode.DST_OUT -> scriptBlend.forEachDstOut(
-                sourceAllocation, destAllocation, options
-            )
-            BlendingMode.SRC_ATOP -> scriptBlend.forEachSrcAtop(
-                sourceAllocation, destAllocation, options
-            )
-            BlendingMode.DST_ATOP -> scriptBlend.forEachDstAtop(
-                sourceAllocation, destAllocation, options
-            )
-            BlendingMode.XOR -> scriptBlend.forEachXor(
-                sourceAllocation, destAllocation, options
-            )
-            BlendingMode.MULTIPLY -> scriptBlend.forEachMultiply(
-                sourceAllocation, destAllocation, options
-            )
-            BlendingMode.ADD -> scriptBlend.forEachAdd(
-                sourceAllocation, destAllocation, options
-            )
-            BlendingMode.SUBTRACT -> scriptBlend.forEachSubtract(
-                sourceAllocation, destAllocation, options
-            )
-        }
-    } else {
-        when (mode) {
-            BlendingMode.CLEAR -> scriptBlend.forEachClear(
-                sourceAllocation, destAllocation
-            )
-            BlendingMode.SRC -> scriptBlend.forEachSrc(
-                sourceAllocation, destAllocation
-            )
-            BlendingMode.DST -> scriptBlend.forEachDst(
-                sourceAllocation, destAllocation
-            )
-            BlendingMode.SRC_OVER -> scriptBlend.forEachSrcOver(
-                sourceAllocation, destAllocation
-            )
-            BlendingMode.DST_OVER -> scriptBlend.forEachDstOver(
-                sourceAllocation, destAllocation
-            )
-            BlendingMode.SRC_IN -> scriptBlend.forEachSrcIn(
-                sourceAllocation, destAllocation
-            )
-            BlendingMode.DST_IN -> scriptBlend.forEachDstIn(
-                sourceAllocation, destAllocation
-            )
-            BlendingMode.SRC_OUT -> scriptBlend.forEachSrcOut(
-                sourceAllocation, destAllocation
-            )
-            BlendingMode.DST_OUT -> scriptBlend.forEachDstOut(
-                sourceAllocation, destAllocation
-            )
-            BlendingMode.SRC_ATOP -> scriptBlend.forEachSrcAtop(
-                sourceAllocation, destAllocation
-            )
-            BlendingMode.DST_ATOP -> scriptBlend.forEachDstAtop(
-                sourceAllocation, destAllocation
-            )
-            BlendingMode.XOR -> scriptBlend.forEachXor(
-                sourceAllocation, destAllocation
-            )
-            BlendingMode.MULTIPLY -> scriptBlend.forEachMultiply(
-                sourceAllocation, destAllocation
-            )
-            BlendingMode.ADD -> scriptBlend.forEachAdd(
-                sourceAllocation, destAllocation
-            )
-            BlendingMode.SUBTRACT -> scriptBlend.forEachSubtract(
-                sourceAllocation, destAllocation
-            )
-        }
-    }
-}

diff --git a/toolkit/test/IntrinsicBlur.kt b/toolkit/test/IntrinsicBlur.kt
deleted file mode 100644
index be09094..0000000
--- a/toolkit/test/IntrinsicBlur.kt
+++ /dev/null

@@ -1,108 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-import android.graphics.Bitmap
-import android.renderscript.Allocation
-import android.renderscript.Element
-import android.renderscript.RenderScript
-import android.renderscript.Script
-import android.renderscript.ScriptIntrinsicBlur
-import android.renderscript.Type
-import android.renderscript.toolkit.Range2d
-
-/**
- * Does a Blur operation using the RenderScript Intrinsics.
- */
-fun intrinsicBlur(
-    context: RenderScript,
-    inputArray: ByteArray,
-    vectorSize: Int,
-    sizeX: Int,
-    sizeY: Int,
-    radius: Int,
-    restriction: Range2d?
-): ByteArray {
-    val scriptBlur = ScriptIntrinsicBlur.create(
-        context,
-        if (vectorSize == 4) Element.RGBA_8888(context) else Element.U8(context)
-    )
-    val builder =
-        Type.Builder(
-            context,
-            renderScriptVectorElementForU8(context, vectorSize)
-        )
-    builder.setX(sizeX)
-    builder.setY(sizeY)
-    val arrayType = builder.create()
-    val inputAllocation = Allocation.createTyped(context, arrayType)
-    inputAllocation.copyFrom(inputArray)
-    val outAllocation = Allocation.createTyped(context, arrayType)
-
-    val intrinsicOutArray = ByteArray(sizeX * sizeY * vectorSize)
-    scriptBlur.setRadius(radius.toFloat())
-    scriptBlur.setInput(inputAllocation)
-
-    if (restriction != null) {
-        outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
-        val options = Script.LaunchOptions()
-        options.setX(restriction.startX, restriction.endX)
-        options.setY(restriction.startY, restriction.endY)
-        scriptBlur.forEach(outAllocation, options)
-    } else {
-        scriptBlur.forEach(outAllocation)
-    }
-    outAllocation.copyTo(intrinsicOutArray)
-    inputAllocation.destroy()
-    outAllocation.destroy()
-    arrayType.destroy()
-    scriptBlur.destroy()
-    return intrinsicOutArray
-}
-
-fun intrinsicBlur(
-    context: RenderScript,
-    bitmap: Bitmap,
-    radius: Int,
-    restriction: Range2d?
-): ByteArray {
-    val baseElement = renderScriptElementForBitmap(context, bitmap)
-    val scriptBlur = ScriptIntrinsicBlur.create(context, baseElement)
-    val inputAllocation = Allocation.createFromBitmap(context, bitmap)
-    inputAllocation.copyFrom(bitmap)
-    val outAllocation = Allocation.createTyped(context, inputAllocation.type)
-    val intrinsicOutArray = ByteArray(bitmap.byteCount)
-
-    scriptBlur.setRadius(radius.toFloat())
-    scriptBlur.setInput(inputAllocation)
-
-    if (restriction != null) {
-        outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
-        val options = Script.LaunchOptions()
-        options.setX(restriction.startX, restriction.endX)
-        options.setY(restriction.startY, restriction.endY)
-        scriptBlur.forEach(outAllocation, options)
-    } else {
-        scriptBlur.forEach(outAllocation)
-    }
-    outAllocation.copyTo(intrinsicOutArray)
-
-    inputAllocation.destroy()
-    outAllocation.destroy()
-    scriptBlur.destroy()
-    return intrinsicOutArray
-}

diff --git a/toolkit/test/IntrinsicColorMatrix.kt b/toolkit/test/IntrinsicColorMatrix.kt
deleted file mode 100644
index c0ccc67..0000000
--- a/toolkit/test/IntrinsicColorMatrix.kt
+++ /dev/null

@@ -1,162 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-import android.graphics.Bitmap
-import android.renderscript.Allocation
-import android.renderscript.Matrix4f
-import android.renderscript.RenderScript
-import android.renderscript.Script
-import android.renderscript.ScriptIntrinsicColorMatrix
-import android.renderscript.Type
-import android.renderscript.Float4
-import android.renderscript.toolkit.Range2d
-
-/**
- * Does a ColorMatrix operation using the RenderScript Intrinsics.
- */
-fun intrinsicColorMatrix(
-    context: RenderScript,
-    conversion: Tester.ColorMatrixConversionType,
-    inputArray: ByteArray,
-    inputVectorSize: Int,
-    sizeX: Int,
-    sizeY: Int,
-    outputVectorSize: Int,
-    matrix: FloatArray,
-    addVector: FloatArray,
-    restriction: Range2d?
-): ByteArray {
-    val scriptColorMatrix = ScriptIntrinsicColorMatrix.create(context)
-    val inputBuilder = Type.Builder(
-        context, renderScriptVectorElementForU8(
-            context,
-            inputVectorSize
-        )
-    )
-    inputBuilder.setX(sizeX)
-    inputBuilder.setY(sizeY)
-    val inputArrayType = inputBuilder.create()
-    val inputAllocation = Allocation.createTyped(context, inputArrayType)
-    val outputBuilder = Type.Builder(
-        context, renderScriptVectorElementForU8(
-            context,
-            outputVectorSize
-        )
-    )
-    outputBuilder.setX(sizeX)
-    outputBuilder.setY(sizeY)
-    val outputArrayType = outputBuilder.create()
-    val outAllocation = Allocation.createTyped(context, outputArrayType)
-
-    inputAllocation.copyFrom(inputArray)
-    val intrinsicOutArray = ByteArray(sizeX * sizeY * paddedSize(outputVectorSize))
-    when (conversion) {
-        Tester.ColorMatrixConversionType.RGB_TO_YUV -> scriptColorMatrix.setRGBtoYUV()
-        Tester.ColorMatrixConversionType.YUV_TO_RGB -> scriptColorMatrix.setYUVtoRGB()
-        Tester.ColorMatrixConversionType.GREYSCALE -> scriptColorMatrix.setGreyscale()
-        Tester.ColorMatrixConversionType.RANDOM -> {
-            val m = Matrix4f()
-            var index = 0
-            // RS is column major
-            for (x in 0..3) {
-                for (y in 0..3) {
-                    m.set(x, y, matrix[index++])
-                }
-            }
-            scriptColorMatrix.setColorMatrix(m)
-        }
-    }
-    val vector = Float4(
-        addVector[0],
-        addVector[1],
-        addVector[2],
-        addVector[3]
-    )
-    scriptColorMatrix.setAdd(vector)
-    if (restriction != null) {
-        outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
-        val options = Script.LaunchOptions()
-        options.setX(restriction.startX, restriction.endX)
-        options.setY(restriction.startY, restriction.endY)
-        scriptColorMatrix.forEach(inputAllocation, outAllocation, options)
-    } else {
-        scriptColorMatrix.forEach(inputAllocation, outAllocation)
-    }
-    outAllocation.copyTo(intrinsicOutArray)
-
-    inputAllocation.destroy()
-    outAllocation.destroy()
-    inputArrayType.destroy()
-    outputArrayType.destroy()
-    scriptColorMatrix.destroy()
-    return intrinsicOutArray
-}
-
-fun intrinsicColorMatrix(
-    context: RenderScript,
-    conversion: Tester.ColorMatrixConversionType,
-    bitmap: Bitmap,
-    matrix: FloatArray,
-    addVector: FloatArray,
-    restriction: Range2d?
-): ByteArray {
-    val scriptColorMatrix = ScriptIntrinsicColorMatrix.create(context)
-    val inputAllocation = Allocation.createFromBitmap(context, bitmap)
-    inputAllocation.copyFrom(bitmap)
-    val outAllocation = Allocation.createTyped(context, inputAllocation.type)
-    val intrinsicOutArray = ByteArray(bitmap.byteCount)
-
-    when (conversion) {
-        Tester.ColorMatrixConversionType.RGB_TO_YUV -> scriptColorMatrix.setRGBtoYUV()
-        Tester.ColorMatrixConversionType.YUV_TO_RGB -> scriptColorMatrix.setYUVtoRGB()
-        Tester.ColorMatrixConversionType.GREYSCALE -> scriptColorMatrix.setGreyscale()
-        Tester.ColorMatrixConversionType.RANDOM -> {
-            val m = Matrix4f()
-            var index = 0
-            // RS is column major
-            for (x in 0..3) {
-                for (y in 0..3) {
-                    m.set(x, y, matrix[index++])
-                }
-            }
-            scriptColorMatrix.setColorMatrix(m)
-        }
-    }
-    val vector = Float4(
-        addVector[0],
-        addVector[1],
-        addVector[2],
-        addVector[3]
-    )
-    scriptColorMatrix.setAdd(vector)
-    if (restriction != null) {
-        outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
-        val options = Script.LaunchOptions()
-        options.setX(restriction.startX, restriction.endX)
-        options.setY(restriction.startY, restriction.endY)
-        scriptColorMatrix.forEach(inputAllocation, outAllocation, options)
-    } else {
-        scriptColorMatrix.forEach(inputAllocation, outAllocation)
-    }
-    outAllocation.copyTo(intrinsicOutArray)
-
-    inputAllocation.destroy()
-    outAllocation.destroy()
-    scriptColorMatrix.destroy()
-    return intrinsicOutArray
-}

diff --git a/toolkit/test/IntrinsicConvolve.kt b/toolkit/test/IntrinsicConvolve.kt
deleted file mode 100644
index 0c9e4f0..0000000
--- a/toolkit/test/IntrinsicConvolve.kt
+++ /dev/null

@@ -1,140 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-import android.graphics.Bitmap
-import android.renderscript.Allocation
-import android.renderscript.Element
-import android.renderscript.RenderScript
-import android.renderscript.Script
-import android.renderscript.ScriptIntrinsicConvolve3x3
-import android.renderscript.ScriptIntrinsicConvolve5x5
-import android.renderscript.Type
-import android.renderscript.toolkit.Range2d
-
-/**
- * Does a Convolve operation using the RenderScript Intrinsics.
- */
-fun intrinsicConvolve(
-    context: RenderScript,
-    inputArray: ByteArray,
-    vectorSize: Int,
-    sizeX: Int,
-    sizeY: Int,
-    coefficients: FloatArray,
-    restriction: Range2d?
-): ByteArray {
-    val baseElement = renderScriptVectorElementForU8(context, vectorSize)
-    val builder = Type.Builder(context, baseElement)
-    builder.setX(sizeX)
-    builder.setY(sizeY)
-    val arrayType = builder.create()
-    val inputAllocation = Allocation.createTyped(context, arrayType)
-    val outAllocation = Allocation.createTyped(context, arrayType)
-    inputAllocation.copyFrom(inputArray)
-    val intrinsicOutArray = ByteArray(sizeX * sizeY * paddedSize(vectorSize))
-    if (restriction != null) {
-        outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
-    }
-    invokeConvolveKernel(
-        coefficients,
-        context,
-        baseElement,
-        inputAllocation,
-        restriction,
-        outAllocation
-    )
-    outAllocation.copyTo(intrinsicOutArray)
-    inputAllocation.destroy()
-    outAllocation.destroy()
-    arrayType.destroy()
-    return intrinsicOutArray
-}
-
-fun intrinsicConvolve(
-    context: RenderScript,
-    bitmap: Bitmap,
-    coefficients: FloatArray,
-    restriction: Range2d?
-): ByteArray {
-    val baseElement = renderScriptElementForBitmap(context, bitmap)
-
-    val inputAllocation = Allocation.createFromBitmap(context, bitmap)
-    val outAllocation = Allocation.createTyped(context, inputAllocation.type)
-    val intrinsicOutArray = ByteArray(bitmap.byteCount)
-    inputAllocation.copyFrom(bitmap)
-    if (restriction != null) {
-        outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
-    }
-    invokeConvolveKernel(
-        coefficients,
-        context,
-        baseElement,
-        inputAllocation,
-        restriction,
-        outAllocation
-    )
-    outAllocation.copyTo(intrinsicOutArray)
-    inputAllocation.destroy()
-    outAllocation.destroy()
-    return intrinsicOutArray
-}
-
-private fun invokeConvolveKernel(
-    coefficients: FloatArray,
-    context: RenderScript,
-    baseElement: Element,
-    inputAllocation: Allocation?,
-    restriction: Range2d?,
-    outAllocation: Allocation?
-) {
-    when (coefficients.size) {
-        9 -> {
-            val scriptConvolve3x3 =
-                ScriptIntrinsicConvolve3x3.create(context, baseElement)
-            scriptConvolve3x3.setCoefficients(coefficients)
-            scriptConvolve3x3.setInput(inputAllocation)
-            if (restriction != null) {
-                val options = Script.LaunchOptions()
-                options.setX(restriction.startX, restriction.endX)
-                options.setY(restriction.startY, restriction.endY)
-                scriptConvolve3x3.forEach(outAllocation, options)
-            } else {
-                scriptConvolve3x3.forEach(outAllocation)
-            }
-            scriptConvolve3x3.destroy()
-        }
-        25 -> {
-            val scriptConvolve5x5 =
-                ScriptIntrinsicConvolve5x5.create(context, baseElement)
-            scriptConvolve5x5.setCoefficients(coefficients)
-            scriptConvolve5x5.setInput(inputAllocation)
-            if (restriction != null) {
-                val options = Script.LaunchOptions()
-                options.setX(restriction.startX, restriction.endX)
-                options.setY(restriction.startY, restriction.endY)
-                scriptConvolve5x5.forEach(outAllocation, options)
-            } else {
-                scriptConvolve5x5.forEach(outAllocation)
-            }
-            scriptConvolve5x5.destroy()
-        }
-        else -> {
-            throw IllegalArgumentException("RenderScriptToolkit tests. Only 3x3 and 5x5 convolutions are supported. ${coefficients.size} coefficients provided.")
-        }
-    }
-}

diff --git a/toolkit/test/IntrinsicHistogram.kt b/toolkit/test/IntrinsicHistogram.kt
deleted file mode 100644
index 25cc55d..0000000
--- a/toolkit/test/IntrinsicHistogram.kt
+++ /dev/null

@@ -1,196 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-import android.graphics.Bitmap
-import android.renderscript.Allocation
-import android.renderscript.Element
-import android.renderscript.RenderScript
-import android.renderscript.Script
-import android.renderscript.ScriptIntrinsicHistogram
-import android.renderscript.Type
-import android.renderscript.toolkit.Range2d
-
-/**
- * Does a Histogram operation using the RenderScript Intrinsics.
- */
-fun intrinsicHistogram(
-    context: RenderScript,
-    inputArray: ByteArray,
-    vectorSize: Int,
-    sizeX: Int,
-    sizeY: Int,
-    restriction: Range2d?
-): IntArray {
-    val element = renderScriptVectorElementForU8(context, vectorSize)
-    val scriptHistogram = ScriptIntrinsicHistogram.create(context, element)
-    val builder = Type.Builder(context, element)
-    builder.setX(sizeX)
-    builder.setY(sizeY)
-    val arrayType = builder.create()
-    val inputAllocation = Allocation.createTyped(context, arrayType)
-    val outAllocation =
-        Allocation.createSized(
-            context,
-            renderScriptVectorElementForI32(context, vectorSize),
-            256
-        )
-    inputAllocation.copyFrom(inputArray)
-    scriptHistogram.setOutput(outAllocation)
-    if (restriction != null) {
-        val options = Script.LaunchOptions()
-        options.setX(restriction.startX, restriction.endX)
-        options.setY(restriction.startY, restriction.endY)
-        scriptHistogram.forEach(inputAllocation, options)
-    } else {
-        scriptHistogram.forEach(inputAllocation)
-    }
-
-    val intrinsicOutArray = IntArray(256 * paddedSize(vectorSize))
-    outAllocation.copyTo(intrinsicOutArray)
-    inputAllocation.destroy()
-    outAllocation.destroy()
-    arrayType.destroy()
-    scriptHistogram.destroy()
-    return intrinsicOutArray
-}
-
-fun intrinsicHistogram(
-    context: RenderScript,
-    bitmap: Bitmap,
-    restriction: Range2d?
-): IntArray {
-    val baseElement = renderScriptElementForBitmap(context, bitmap)
-    val scriptHistogram = ScriptIntrinsicHistogram.create(context, baseElement)
-    val inputAllocation = Allocation.createFromBitmap(context, bitmap)
-    inputAllocation.copyFrom(bitmap)
-    val vectorSize = vectorSizeOfBitmap(bitmap)
-    val outAllocation =
-        Allocation.createSized(
-            context,
-            renderScriptVectorElementForI32(context, vectorSize),
-            256
-        )
-    scriptHistogram.setOutput(outAllocation)
-    if (restriction != null) {
-        val options = Script.LaunchOptions()
-        options.setX(restriction.startX, restriction.endX)
-        options.setY(restriction.startY, restriction.endY)
-        scriptHistogram.forEach(inputAllocation, options)
-    } else {
-        scriptHistogram.forEach(inputAllocation)
-    }
-
-    val intrinsicOutArray = IntArray(256 * vectorSize)
-    outAllocation.copyTo(intrinsicOutArray)
-    inputAllocation.destroy()
-    outAllocation.destroy()
-    scriptHistogram.destroy()
-    return intrinsicOutArray
-}
-
-fun intrinsicHistogramDot(
-    context: RenderScript,
-    inputArray: ByteArray,
-    vectorSize: Int,
-    sizeX: Int,
-    sizeY: Int,
-    coefficients: FloatArray?,
-    restriction: Range2d?
-): IntArray {
-    val element = renderScriptVectorElementForU8(context, vectorSize)
-    val scriptHistogram = ScriptIntrinsicHistogram.create(context, element)
-    val builder = Type.Builder(context, element)
-    builder.setX(sizeX)
-    builder.setY(sizeY)
-    val arrayType = builder.create()
-    val inputAllocation = Allocation.createTyped(context, arrayType)
-    val outAllocation =
-        Allocation.createSized(context, Element.I32(context), 256)
-    inputAllocation.copyFrom(inputArray)
-
-    if (coefficients != null) {
-        require(coefficients.size == vectorSize) {
-            "RenderScriptToolkit tests. $vectorSize coefficients are required for histogram. " +
-                "${coefficients.size} provided."
-        }
-        scriptHistogram.setDotCoefficients(
-            coefficients[0],
-            if (vectorSize > 1) coefficients[1] else 0f,
-            if (vectorSize > 2) coefficients[2] else 0f,
-            if (vectorSize > 3) coefficients[3] else 0f
-        )
-    }
-    scriptHistogram.setOutput(outAllocation)
-    if (restriction != null) {
-        val options = Script.LaunchOptions()
-        options.setX(restriction.startX, restriction.endX)
-        options.setY(restriction.startY, restriction.endY)
-        scriptHistogram.forEach_Dot(inputAllocation, options)
-    } else {
-        scriptHistogram.forEach_Dot(inputAllocation)
-    }
-    val intrinsicOutArray = IntArray(256)
-    outAllocation.copyTo(intrinsicOutArray)
-    inputAllocation.destroy()
-    outAllocation.destroy()
-    arrayType.destroy()
-    scriptHistogram.destroy()
-    return intrinsicOutArray
-}
-
-fun intrinsicHistogramDot(
-    context: RenderScript,
-    bitmap: Bitmap,
-    coefficients: FloatArray?,
-    restriction: Range2d?
-): IntArray {
-    val baseElement = renderScriptElementForBitmap(context, bitmap)
-    val scriptHistogram = ScriptIntrinsicHistogram.create(context, baseElement)
-    val inputAllocation = Allocation.createFromBitmap(context, bitmap)
-    inputAllocation.copyFrom(bitmap)
-    val outAllocation =
-        Allocation.createSized(context, Element.I32(context), 256)
-
-    if (coefficients != null) {
-        require(coefficients.size == 4) {
-            "RenderScriptToolkit tests. Four coefficients are required for histogram. " +
-                "${coefficients.size} provided."
-        }
-        scriptHistogram.setDotCoefficients(
-            coefficients[0],
-            coefficients[1],
-            coefficients[2],
-            coefficients[3]
-        )
-    }
-    scriptHistogram.setOutput(outAllocation)
-    if (restriction != null) {
-        val options = Script.LaunchOptions()
-        options.setX(restriction.startX, restriction.endX)
-        options.setY(restriction.startY, restriction.endY)
-        scriptHistogram.forEach_Dot(inputAllocation, options)
-    } else {
-        scriptHistogram.forEach_Dot(inputAllocation)
-    }
-    val intrinsicOutArray = IntArray(256)
-    outAllocation.copyTo(intrinsicOutArray)
-    inputAllocation.destroy()
-    outAllocation.destroy()
-    scriptHistogram.destroy()
-    return intrinsicOutArray
-}

diff --git a/toolkit/test/IntrinsicLut.kt b/toolkit/test/IntrinsicLut.kt
deleted file mode 100644
index 1ed03ac..0000000
--- a/toolkit/test/IntrinsicLut.kt
+++ /dev/null

@@ -1,118 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-import android.graphics.Bitmap
-import android.renderscript.Allocation
-import android.renderscript.Element
-import android.renderscript.RenderScript
-import android.renderscript.Script
-import android.renderscript.ScriptIntrinsicLUT
-import android.renderscript.Type
-import android.renderscript.toolkit.Range2d
-
-/**
- * Does a LookUpTable operation using the RenderScript Intrinsics.
- */
-@ExperimentalUnsignedTypes
-fun intrinsicLut(
-    context: RenderScript,
-    inputArray: ByteArray,
-    sizeX: Int,
-    sizeY: Int,
-    newRed: ByteArray,
-    newGreen: ByteArray,
-    newBlue: ByteArray,
-    newAlpha: ByteArray,
-    restriction: Range2d?
-): ByteArray {
-    val scriptLut: ScriptIntrinsicLUT = ScriptIntrinsicLUT.create(
-        context,
-        Element.U8_4(context)
-    )
-    val builder = Type.Builder(context, Element.U8_4(context))
-    builder.setX(sizeX)
-    builder.setY(sizeY)
-    val arrayType = builder.create()
-    val inputAllocation = Allocation.createTyped(context, arrayType)
-    val outAllocation = Allocation.createTyped(context, arrayType)
-    inputAllocation.copyFrom(inputArray)
-    val intrinsicOutArray = ByteArray(sizeX * sizeY * 4)
-
-    for (v in 0..255) {
-        scriptLut.setRed(v, newRed[v].toUByte().toInt())
-        scriptLut.setGreen(v, newGreen[v].toUByte().toInt())
-        scriptLut.setBlue(v, newBlue[v].toUByte().toInt())
-        scriptLut.setAlpha(v, newAlpha[v].toUByte().toInt())
-    }
-    if (restriction != null) {
-        outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
-        val options = Script.LaunchOptions()
-        options.setX(restriction.startX, restriction.endX)
-        options.setY(restriction.startY, restriction.endY)
-        scriptLut.forEach(inputAllocation, outAllocation, options)
-    } else {
-        scriptLut.forEach(inputAllocation, outAllocation)
-    }
-
-    outAllocation.copyTo(intrinsicOutArray)
-    inputAllocation.destroy()
-    outAllocation.destroy()
-    arrayType.destroy()
-    scriptLut.destroy()
-    return intrinsicOutArray
-}
-
-@ExperimentalUnsignedTypes
-fun intrinsicLut(
-    context: RenderScript,
-    bitmap: Bitmap,
-    newRed: ByteArray,
-    newGreen: ByteArray,
-    newBlue: ByteArray,
-    newAlpha: ByteArray,
-    restriction: Range2d?
-): ByteArray {
-    val baseElement = renderScriptElementForBitmap(context, bitmap)
-    val scriptLut: ScriptIntrinsicLUT = ScriptIntrinsicLUT.create(context, baseElement)
-    val inputAllocation = Allocation.createFromBitmap(context, bitmap)
-    inputAllocation.copyFrom(bitmap)
-    val outAllocation = Allocation.createTyped(context, inputAllocation.type)
-    val intrinsicOutArray = ByteArray(bitmap.byteCount)
-
-    for (v in 0..255) {
-        scriptLut.setRed(v, newRed[v].toUByte().toInt())
-        scriptLut.setGreen(v, newGreen[v].toUByte().toInt())
-        scriptLut.setBlue(v, newBlue[v].toUByte().toInt())
-        scriptLut.setAlpha(v, newAlpha[v].toUByte().toInt())
-    }
-    if (restriction != null) {
-        outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
-        val options = Script.LaunchOptions()
-        options.setX(restriction.startX, restriction.endX)
-        options.setY(restriction.startY, restriction.endY)
-        scriptLut.forEach(inputAllocation, outAllocation, options)
-    } else {
-        scriptLut.forEach(inputAllocation, outAllocation)
-    }
-
-    outAllocation.copyTo(intrinsicOutArray)
-    inputAllocation.destroy()
-    outAllocation.destroy()
-    scriptLut.destroy()
-    return intrinsicOutArray
-}

diff --git a/toolkit/test/IntrinsicLut3d.kt b/toolkit/test/IntrinsicLut3d.kt
deleted file mode 100644
index 48e785e..0000000
--- a/toolkit/test/IntrinsicLut3d.kt
+++ /dev/null

@@ -1,123 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-import android.graphics.Bitmap
-import android.renderscript.Allocation
-import android.renderscript.Element
-import android.renderscript.RenderScript
-import android.renderscript.Script
-import android.renderscript.ScriptIntrinsic3DLUT
-import android.renderscript.Type
-import android.renderscript.toolkit.Range2d
-
-/**
- * Does a 3D LookUpTable operation using the RenderScript Intrinsics.
- */
-fun intrinsicLut3d(
-    context: RenderScript,
-    inputArray: ByteArray,
-    sizeX: Int,
-    sizeY: Int,
-    cubeArray: ByteArray,
-    cubeSize: Dimension,
-    restriction: Range2d?
-): ByteArray {
-    val scriptLut3d: ScriptIntrinsic3DLUT = ScriptIntrinsic3DLUT.create(
-        context, Element.U8_4(
-            context
-        )
-    )
-    val builder = Type.Builder(context, Element.U8_4(context))
-    builder.setX(sizeX)
-    builder.setY(sizeY)
-    val arrayType = builder.create()
-    val inputAllocation = Allocation.createTyped(context, arrayType)
-    val outAllocation = Allocation.createTyped(context, arrayType)
-    inputAllocation.copyFrom(inputArray)
-    val intrinsicOutArray = ByteArray(sizeX * sizeY * 4)
-
-    val cubeTypeBuilder: Type.Builder =
-        Type.Builder(context, Element.U8_4(context))
-    cubeTypeBuilder.setX(cubeSize.sizeX)
-    cubeTypeBuilder.setY(cubeSize.sizeY)
-    cubeTypeBuilder.setZ(cubeSize.sizeZ)
-    val cubeType: Type = cubeTypeBuilder.create()
-    val cubeAllocation = Allocation.createTyped(context, cubeType)
-    cubeAllocation.copyFrom(cubeArray)
-    scriptLut3d.setLUT(cubeAllocation)
-    if (restriction != null) {
-        outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
-        val options = Script.LaunchOptions()
-        options.setX(restriction.startX, restriction.endX)
-        options.setY(restriction.startY, restriction.endY)
-        scriptLut3d.forEach(inputAllocation, outAllocation, options)
-    } else {
-        scriptLut3d.forEach(inputAllocation, outAllocation)
-    }
-
-    outAllocation.copyTo(intrinsicOutArray)
-    inputAllocation.destroy()
-    outAllocation.destroy()
-    cubeAllocation.destroy()
-    arrayType.destroy()
-    cubeType.destroy()
-    scriptLut3d.destroy()
-    return intrinsicOutArray
-}
-
-fun intrinsicLut3d(
-    context: RenderScript,
-    bitmap: Bitmap,
-    cubeArray: ByteArray,
-    cubeSize: Dimension,
-    restriction: Range2d?
-): ByteArray {
-    val baseElement = renderScriptElementForBitmap(context, bitmap)
-    val scriptLut3d: ScriptIntrinsic3DLUT = ScriptIntrinsic3DLUT.create(context, baseElement)
-    val inputAllocation = Allocation.createFromBitmap(context, bitmap)
-    inputAllocation.copyFrom(bitmap)
-    val outAllocation = Allocation.createTyped(context, inputAllocation.type)
-    val intrinsicOutArray = ByteArray(bitmap.byteCount)
-
-    val cubeTypeBuilder: Type.Builder =
-        Type.Builder(context, Element.U8_4(context))
-    cubeTypeBuilder.setX(cubeSize.sizeX)
-    cubeTypeBuilder.setY(cubeSize.sizeY)
-    cubeTypeBuilder.setZ(cubeSize.sizeZ)
-    val cubeType: Type = cubeTypeBuilder.create()
-    val cubeAllocation = Allocation.createTyped(context, cubeType)
-    cubeAllocation.copyFrom(cubeArray)
-    scriptLut3d.setLUT(cubeAllocation)
-    if (restriction != null) {
-        outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
-        val options = Script.LaunchOptions()
-        options.setX(restriction.startX, restriction.endX)
-        options.setY(restriction.startY, restriction.endY)
-        scriptLut3d.forEach(inputAllocation, outAllocation, options)
-    } else {
-        scriptLut3d.forEach(inputAllocation, outAllocation)
-    }
-
-    outAllocation.copyTo(intrinsicOutArray)
-    inputAllocation.destroy()
-    outAllocation.destroy()
-    cubeAllocation.destroy()
-    cubeType.destroy()
-    scriptLut3d.destroy()
-    return intrinsicOutArray
-}

diff --git a/toolkit/test/IntrinsicResize.kt b/toolkit/test/IntrinsicResize.kt
deleted file mode 100644
index 5cdf89a..0000000
--- a/toolkit/test/IntrinsicResize.kt
+++ /dev/null

@@ -1,119 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-import android.graphics.Bitmap
-import android.renderscript.Allocation
-import android.renderscript.RenderScript
-import android.renderscript.Script
-import android.renderscript.ScriptIntrinsicResize
-import android.renderscript.Type
-import android.renderscript.toolkit.Range2d
-
-/**
- * Does a Resize operation using the RenderScript Intrinsics.
- */
-fun intrinsicResize(
-    context: RenderScript,
-    inputArray: ByteArray,
-    vectorSize: Int,
-    inSizeX: Int,
-    inSizeY: Int,
-    outSizeX: Int,
-    outSizeY: Int,
-    restriction: Range2d?
-): ByteArray {
-    val scriptResize = ScriptIntrinsicResize.create(context)
-    val builder = Type.Builder(
-        context,
-        renderScriptVectorElementForU8(context, vectorSize)
-    )
-    builder.setX(inSizeX)
-    builder.setY(inSizeY)
-    val inputArrayType = builder.create()
-    val inputAllocation = Allocation.createTyped(context, inputArrayType)
-    builder.setX(outSizeX)
-    builder.setY(outSizeY)
-    val outputArrayType = builder.create()
-    val outAllocation = Allocation.createTyped(context, outputArrayType)
-    val intrinsicOutArray = ByteArray(outSizeX * outSizeY * paddedSize(vectorSize))
-
-    inputAllocation.copyFrom(inputArray)
-    scriptResize.setInput(inputAllocation)
-    if (restriction != null) {
-        outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
-        val options = Script.LaunchOptions()
-        options.setX(restriction.startX, restriction.endX)
-        options.setY(restriction.startY, restriction.endY)
-        scriptResize.forEach_bicubic(outAllocation, options)
-    } else {
-        scriptResize.forEach_bicubic(outAllocation)
-    }
-    outAllocation.copyTo(intrinsicOutArray)
-
-    inputAllocation.destroy()
-    outAllocation.destroy()
-    scriptResize.destroy()
-    inputArrayType.destroy()
-    outputArrayType.destroy()
-    return intrinsicOutArray
-}
-
-fun intrinsicResize(
-    context: RenderScript,
-    bitmap: Bitmap,
-    outSizeX: Int,
-    outSizeY: Int,
-    restriction: Range2d?
-): ByteArray {
-    val scriptResize = ScriptIntrinsicResize.create(context)
-    val inputAllocation = Allocation.createFromBitmap(context, bitmap)
-    inputAllocation.copyFrom(bitmap)
-
-    val vectorSize = when (bitmap.config) {
-        Bitmap.Config.ARGB_8888 -> 4
-        Bitmap.Config.ALPHA_8 -> 1
-        else -> error("Unrecognized bitmap config $bitmap.config")
-    }
-    val builder = Type.Builder(
-        context,
-        renderScriptVectorElementForU8(context, vectorSize)
-    )
-    builder.setX(outSizeX)
-    builder.setY(outSizeY)
-    val outputArrayType = builder.create()
-    val outAllocation = Allocation.createTyped(context, outputArrayType)
-    val intrinsicOutArray = ByteArray(outSizeX * outSizeY * vectorSize)
-
-    scriptResize.setInput(inputAllocation)
-    if (restriction != null) {
-        outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
-        val options = Script.LaunchOptions()
-        options.setX(restriction.startX, restriction.endX)
-        options.setY(restriction.startY, restriction.endY)
-        scriptResize.forEach_bicubic(outAllocation, options)
-    } else {
-        scriptResize.forEach_bicubic(outAllocation)
-    }
-    outAllocation.copyTo(intrinsicOutArray)
-
-    inputAllocation.destroy()
-    outAllocation.destroy()
-    outputArrayType.destroy()
-    scriptResize.destroy()
-    return intrinsicOutArray
-}

diff --git a/toolkit/test/IntrinsicYuvToRgb.kt b/toolkit/test/IntrinsicYuvToRgb.kt
deleted file mode 100644
index 5e46f2e..0000000
--- a/toolkit/test/IntrinsicYuvToRgb.kt
+++ /dev/null

@@ -1,70 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-import android.graphics.ImageFormat
-import android.renderscript.Allocation
-import android.renderscript.Element
-import android.renderscript.RenderScript
-import android.renderscript.ScriptIntrinsicYuvToRGB
-import android.renderscript.Type
-import android.renderscript.toolkit.YuvFormat
-
-/**
- * Does a YUV to RGB operation using the RenderScript Intrinsics.
- */
-fun intrinsicYuvToRgb(
-    context: RenderScript,
-    inputArray: ByteArray,
-    sizeX: Int,
-    sizeY: Int,
-    format: YuvFormat
-): ByteArray {
-    val scriptYuvToRgb = ScriptIntrinsicYuvToRGB.create(
-        context,
-        Element.YUV(context)
-    )
-    val inputBuilder = Type.Builder(context, Element.YUV(context))
-    inputBuilder.setX(sizeX)
-    inputBuilder.setY(sizeY)
-    when (format) {
-        YuvFormat.NV21 -> inputBuilder.setYuvFormat(ImageFormat.NV21)
-        YuvFormat.YV12 -> inputBuilder.setYuvFormat(ImageFormat.YV12)
-        else -> require(false) { "Unknown YUV format $format" }
-    }
-    val inputArrayType = inputBuilder.create()
-    val inputAllocation = Allocation.createTyped(context, inputArrayType)
-
-    val outputBuilder = Type.Builder(context, Element.U8_4(context))
-    outputBuilder.setX(sizeX)
-    outputBuilder.setY(sizeY)
-    val outputArrayType = outputBuilder.create()
-    val outAllocation = Allocation.createTyped(context, outputArrayType)
-    val intrinsicOutArray = ByteArray(sizeX * sizeY * 4)
-
-    inputAllocation.copyFrom(inputArray)
-    scriptYuvToRgb.setInput(inputAllocation)
-    scriptYuvToRgb.forEach(outAllocation)
-    outAllocation.copyTo(intrinsicOutArray)
-
-    inputAllocation.destroy()
-    outAllocation.destroy()
-    inputArrayType.destroy()
-    outputArrayType.destroy()
-    scriptYuvToRgb.destroy()
-    return intrinsicOutArray
-}

diff --git a/toolkit/test/MainActivity.kt b/toolkit/test/MainActivity.kt
deleted file mode 100644
index 4092861..0000000
--- a/toolkit/test/MainActivity.kt
+++ /dev/null

@@ -1,55 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-import android.os.Bundle
-import android.widget.TextView
-import androidx.appcompat.app.AppCompatActivity
-
-@ExperimentalUnsignedTypes
-class MainActivity : AppCompatActivity() {
-
-    override fun onCreate(savedInstanceState: Bundle?) {
-        super.onCreate(savedInstanceState)
-        setContentView(R.layout.activity_main)
-
-        // To debug resources not destroyed
-        // "A resource failed to call destroy."
-        try {
-            Class.forName("dalvik.system.CloseGuard")
-                .getMethod("setEnabled", Boolean::class.javaPrimitiveType)
-                .invoke(null, true)
-        } catch (e: ReflectiveOperationException) {
-            throw RuntimeException(e)
-        }
-
-        val validate = true
-        val tester = Tester(this, validate)
-        val numberOfIterations = if (validate) 1 else 28
-        val t = TimingTracker(numberOfIterations, 0)
-        for (i in 1..numberOfIterations) {
-            println("*** Iteration $i of $numberOfIterations ****")
-            //startMethodTracing("myTracing")
-            //startMethodTracingSampling("myTracing_sample", 8000000, 10)
-            val r = tester.testAll(t)
-            //stopMethodTracing()
-            findViewById<TextView>(R.id.sample_text).text = "$r\n\n${t.report()}"
-            t.nextIteration()
-        }
-        tester.destroy()
-    }
-}

diff --git a/toolkit/test/ReferenceBlend.kt b/toolkit/test/ReferenceBlend.kt
deleted file mode 100644
index ba60bc8..0000000
--- a/toolkit/test/ReferenceBlend.kt
+++ /dev/null

@@ -1,82 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-import android.renderscript.toolkit.BlendingMode
-import android.renderscript.toolkit.Range2d
-
-/**
- * Reference implementation of a Blend operation.
- *
- * See the class Rgba for details of arithmetic operation using that class.
- */
-@ExperimentalUnsignedTypes
-fun referenceBlend(
-    mode: BlendingMode,
-    sourceArray: ByteArray,
-    destArray: ByteArray,
-    sizeX: Int,
-    sizeY: Int,
-    restriction: Range2d?
-) {
-    val source = Rgba2dArray(sourceArray, sizeX, sizeY)
-    val dest = Rgba2dArray(destArray, sizeX, sizeY)
-
-    /**
-     * For each corresponding RGBA value of the source and destination arrays, invoke the blend
-     * function and store the result in the destination array.
-     */
-    fun blendEachPair(blendFunction: (src: Rgba, dst: Rgba) -> Rgba) {
-        dest.forEachCell(restriction) { x, y ->
-            dest[x, y] = blendFunction(source[x, y], dest[x, y])
-        }
-    }
-
-    when (mode) {
-        BlendingMode.CLEAR -> blendEachPair { _, _ -> Rgba(0, 0, 0, 0) }
-        BlendingMode.SRC -> blendEachPair { src, _ -> src }
-        BlendingMode.DST -> { /* This doesn't do anything. */ }
-        BlendingMode.SRC_OVER -> blendEachPair { src, dst -> blendOver(src, dst) }
-        BlendingMode.DST_OVER -> blendEachPair { src, dst -> blendOver(dst, src) }
-        BlendingMode.SRC_IN -> blendEachPair { src, dst -> blendIn(src, dst) }
-        BlendingMode.DST_IN -> blendEachPair { src, dst -> blendIn(dst, src) }
-        BlendingMode.SRC_OUT -> blendEachPair { src, dst -> blendOut(src, dst) }
-        BlendingMode.DST_OUT -> blendEachPair { src, dst -> blendOut(dst, src) }
-        BlendingMode.SRC_ATOP -> blendEachPair { src, dst -> blendAtop(src, dst) }
-        BlendingMode.DST_ATOP -> blendEachPair { src, dst -> blendAtop(dst, src) }
-        BlendingMode.XOR -> blendEachPair { src, dst -> src xor dst }
-        BlendingMode.MULTIPLY -> blendEachPair { src, dst -> src * dst }
-        BlendingMode.ADD -> blendEachPair { src, dst -> dst + src }
-        BlendingMode.SUBTRACT -> blendEachPair { src, dst -> dst - src }
-    }
-}
-
-@ExperimentalUnsignedTypes
-private fun blendOver(src: Rgba, dst: Rgba) = src + (dst * (255 - src.a))
-
-@ExperimentalUnsignedTypes
-private fun blendIn(src: Rgba, dst: Rgba) = src * dst.a
-
-@ExperimentalUnsignedTypes
-private fun blendOut(src: Rgba, dst: Rgba) = src * (255 - dst.a)
-
-@ExperimentalUnsignedTypes
-private fun blendAtop(src: Rgba, dst: Rgba): Rgba {
-    val value = src * dst.a + dst * (255 - src.a)
-    value.a = dst.a
-    return value
-}

diff --git a/toolkit/test/ReferenceBlur.kt b/toolkit/test/ReferenceBlur.kt
deleted file mode 100644
index 66c2a05..0000000
--- a/toolkit/test/ReferenceBlur.kt
+++ /dev/null

@@ -1,131 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-import android.renderscript.toolkit.Range2d
-import kotlin.math.max
-import kotlin.math.min
-import kotlin.math.pow
-import kotlin.math.sqrt
-
-/**
- * Reference implementation of a Blur operation.
- */
-@ExperimentalUnsignedTypes
-fun referenceBlur(inputArray: ByteArray,
-                  vectorSize: Int,
-                  sizeX: Int,
-                  sizeY: Int,
-                  radius: Int = 5, restriction: Range2d?): ByteArray {
-    val maxRadius = 25
-    require (radius in 1..maxRadius) {
-        "RenderScriptToolkit blur. Radius should be between 1 and $maxRadius. $radius provided."
-    }
-    val gaussian = buildGaussian(radius)
-
-    // Convert input data to float so that the blurring goes faster.
-    val inputValues = FloatArray(inputArray.size) { byteToUnitFloat(inputArray[it].toUByte()) }
-    val inputInFloat = FloatVector2dArray(inputValues, vectorSize, sizeX, sizeY)
-
-    val scratch = horizontalBlur(inputInFloat, gaussian, radius, restriction)
-    val outInFloat = verticalBlur(scratch, gaussian, radius, restriction)
-
-    // Convert the results back to bytes.
-    return ByteArray(outInFloat.values.size) { unitFloatClampedToUByte(outInFloat.values[it]).toByte() }
-}
-
-/**
- * Blurs along the horizontal direction using the specified gaussian weights.
- */
-private fun horizontalBlur(
-    input: FloatVector2dArray,
-    gaussian: FloatArray,
-    radius: Int,
-    restriction: Range2d?
-): FloatVector2dArray {
-    var expandedRestriction: Range2d? = null
-    if (restriction != null) {
-        // Expand the restriction in the vertical direction so that the vertical pass
-        // will have all the data it needs.
-        expandedRestriction = Range2d(
-            restriction.startX,
-            restriction.endX,
-            max(restriction.startY - radius, 0),
-            min(restriction.endY + radius, input.sizeY)
-        )
-    }
-
-    input.clipAccessToRange = true
-    val out = input.createSameSized()
-    out.forEach(expandedRestriction) { x, y ->
-        for ((gaussianIndex, delta: Int) in (-radius..radius).withIndex()) {
-            val v = input[x + delta, y] * gaussian[gaussianIndex]
-            out[x, y] += v
-        }
-    }
-    return out
-}
-
-/**
- * Blurs along the horizontal direction using the specified gaussian weights.
- */
-private fun verticalBlur(
-    input: FloatVector2dArray,
-    gaussian: FloatArray,
-    radius: Int,
-    restriction: Range2d?
-): FloatVector2dArray {
-    input.clipAccessToRange = true
-    val out = input.createSameSized()
-    out.forEach(restriction) { x, y ->
-        for ((gaussianIndex, delta: Int) in (-radius..radius).withIndex()) {
-            val v = input[x, y + delta] * gaussian[gaussianIndex]
-            out[x, y] += v
-        }
-    }
-    return out
-}
-
-/**
- * Builds an array of gaussian weights that will be used for doing the horizontal and vertical
- * blur.
- *
- * @return An array of (2 * radius + 1) floats.
- */
-private fun buildGaussian(radius: Int): FloatArray {
-    val e: Float = kotlin.math.E.toFloat()
-    val pi: Float = kotlin.math.PI.toFloat()
-    val sigma: Float = 0.4f * radius.toFloat() + 0.6f
-    val coefficient1: Float = 1.0f / (sqrt(2.0f * pi) * sigma)
-    val coefficient2: Float = -1.0f / (2.0f * sigma * sigma)
-
-    var sum = 0.0f
-    val gaussian = FloatArray(radius * 2 + 1)
-    for (r in -radius..radius) {
-        val floatR: Float = r.toFloat()
-        val v: Float = coefficient1 * e.pow(floatR * floatR * coefficient2)
-        gaussian[r + radius] = v
-        sum += v
-    }
-
-    // Normalize so that the sum of the weights equal 1f.
-    val normalizeFactor: Float = 1.0f / sum
-    for (r in -radius..radius) {
-        gaussian[r + radius] *= normalizeFactor
-    }
-    return gaussian
-}

diff --git a/toolkit/test/ReferenceColorMatrix.kt b/toolkit/test/ReferenceColorMatrix.kt
deleted file mode 100644
index 75f93af..0000000
--- a/toolkit/test/ReferenceColorMatrix.kt
+++ /dev/null

@@ -1,57 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-import android.renderscript.toolkit.Range2d
-
-/**
- * Reference implementation of a ColorMatrix operation.
- */
-@ExperimentalUnsignedTypes
-fun referenceColorMatrix(inputArray: ByteArray,
-                         inputVectorSize: Int,
-                         sizeX: Int,
-                         sizeY: Int,
-                         outputVectorSize: Int,
-                         matrix: FloatArray, addVector: FloatArray,
-                         restriction: Range2d?): ByteArray {
-    require (matrix.size == 16) { "RenderScriptToolkit colorMatrix. Matrix should have 16 values. ${matrix.size} provided." }
-
-    val input = Vector2dArray(inputArray.asUByteArray(), inputVectorSize, sizeX, sizeY)
-    val outputArray = ByteArray(sizeX * sizeY * paddedSize(outputVectorSize))
-    val output = Vector2dArray(outputArray.asUByteArray(), outputVectorSize, sizeX, sizeY)
-
-    output.forEach (restriction) { x, y ->
-        val inUByteValue = input[x, y]
-        val inFloatValue = FloatArray(4) { if (it >= inputVectorSize) 0f else byteToUnitFloat(inUByteValue[it]) }
-        val outFloatValue = multiplyAndAdd(matrix, inFloatValue, addVector)
-        val outUByteValue = UByteArray(paddedSize(output.vectorSize)) { unitFloatClampedToUByte(outFloatValue[it]) }
-        output[x, y] = outUByteValue
-    }
-    return outputArray
-}
-
-private fun multiplyAndAdd(matrix: FloatArray, inVector: FloatArray, addVector: FloatArray): FloatArray {
-    // In RenderScript, matrix were set in column major format
-    val result = addVector.clone()
-    for (i in 0..3) {
-        for (j in 0..3) {
-            result[i] += matrix[j * 4 + i] * inVector[j]
-        }
-    }
-    return result
-}

diff --git a/toolkit/test/ReferenceConvolve.kt b/toolkit/test/ReferenceConvolve.kt
deleted file mode 100644
index b9181a9..0000000
--- a/toolkit/test/ReferenceConvolve.kt
+++ /dev/null

@@ -1,68 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-import android.renderscript.toolkit.Range2d
-
-/**
- * Reference implementation of a Convolve operation.
- */
-@ExperimentalUnsignedTypes
-fun referenceConvolve(
-    inputArray: ByteArray,
-    vectorSize: Int,
-    sizeX: Int,
-    sizeY: Int,
-    coefficients: FloatArray,
-    restriction: Range2d?
-): ByteArray {
-    val input = Vector2dArray(inputArray.asUByteArray(), vectorSize, sizeX, sizeY)
-    val radius = when (coefficients.size) {
-        9 -> 1
-        25 -> 2
-        else -> {
-            throw IllegalArgumentException("RenderScriptToolkit Convolve. Only 3x3 and 5x5 convolutions are supported. ${coefficients.size} coefficients provided.")
-        }
-    }
-
-    input.clipReadToRange = true
-    val output = input.createSameSized()
-    input.forEach(restriction) { x, y ->
-        output[x, y] = convolveOne(input, x, y, coefficients, radius)
-    }
-    return output.values.asByteArray()
-}
-
-@ExperimentalUnsignedTypes
-private fun convolveOne(
-    inputAlloc: Vector2dArray,
-    x: Int,
-    y: Int,
-    coefficients: FloatArray,
-    radius: Int
-): UByteArray {
-    var sum = FloatArray(paddedSize(inputAlloc.vectorSize))
-    var coefficientIndex = 0
-    for (deltaY in -radius..radius) {
-        for (deltaX in -radius..radius) {
-            val inputVector = inputAlloc[x + deltaX, y + deltaY]
-            sum += inputVector.toFloatArray() * coefficients[coefficientIndex]
-            coefficientIndex++
-        }
-    }
-    return sum.clampToUByte()
-}

diff --git a/toolkit/test/ReferenceHistogram.kt b/toolkit/test/ReferenceHistogram.kt
deleted file mode 100644
index 6bd9167..0000000
--- a/toolkit/test/ReferenceHistogram.kt
+++ /dev/null

@@ -1,98 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-import android.renderscript.toolkit.Range2d
-
-/**
- * Reference implementation of a Histogram operation.
- *
- * Return an array of 4 * 256 ints.
- * Position 0 is the number of R with a value of 0,
- * Position 1 is the number of G with a value of 0,
- * Position 2 is the number of B with a value of 0,
- * Position 3 is the number of A with a value of 0,
- * Position 4 is the number of R with a value of 1,
- * etc.
-*/
-@ExperimentalUnsignedTypes
-fun referenceHistogram(
-    inputArray: ByteArray,
-    vectorSize: Int,
-    sizeX: Int,
-    sizeY: Int,
-    restriction: Range2d?
-): IntArray {
-    val input = Vector2dArray(inputArray.asUByteArray(), vectorSize, sizeX, sizeY)
-
-    val counts = IntArray(paddedSize(input.vectorSize) * 256)
-    input.forEach(restriction) { x, y ->
-        val value = input[x, y]
-        for (i in 0 until vectorSize) {
-            counts[value[i].toInt() * paddedSize(input.vectorSize) + i]++
-        }
-    }
-    return counts
-}
-
-/**
- * Reference implementation of a HistogramDot operation.
- *
- * Each RGBA input value is dot-multiplied first by the specified coefficients.
- * The resulting value is converted to an integer and used for the histogram.
- */
-@ExperimentalUnsignedTypes
-fun referenceHistogramDot(
-    inputArray: ByteArray,
-    vectorSize: Int,
-    sizeX: Int,
-    sizeY: Int,
-    coefficients: FloatArray?,
-    restriction: Range2d?
-): IntArray {
-    val floatCoefficients = coefficients ?: floatArrayOf(0.299f, 0.587f, 0.114f, 0f)
-    val input = Vector2dArray(inputArray.asUByteArray(), vectorSize, sizeX, sizeY)
-    var coefficientSum = 0f
-    for (c in floatCoefficients) {
-        require (c >= 0) {
-            "RenderScriptToolkit histogramDot. Coefficients must be positive. $c provided."
-        }
-        coefficientSum += c
-    }
-    require(coefficientSum <= 1f) { "RenderScriptToolkit histogramDot. Coefficients should " +
-            "add to 1.0 or less. $coefficientSum provided." }
-
-    // Compute integer
-    val intCoefficients = IntArray(input.vectorSize) { (floatCoefficients[it] * 256f + 0.5f).toInt() }
-
-    val counts = IntArray(256)
-    input.forEach(restriction) { x, y ->
-        val value = input[x, y]
-        // While we could do the computation using floats, we won't get the same results as
-        // the existing intrinsics.
-        var sum = 0
-        // We don't use value.indices because we want to accumulate only 3 values, in the case
-        // of vectorSize == 3.
-        for (i in 0 until vectorSize) {
-            sum += intCoefficients[i] * value[i].toInt()
-        }
-        // Round up and normalize
-        val index = (sum + 0x7f) shr 8
-        counts[index]++
-    }
-    return counts
-}

diff --git a/toolkit/test/ReferenceLut.kt b/toolkit/test/ReferenceLut.kt
deleted file mode 100644
index cd832f0..0000000
--- a/toolkit/test/ReferenceLut.kt
+++ /dev/null

@@ -1,48 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-import android.renderscript.toolkit.LookupTable
-import android.renderscript.toolkit.Range2d
-
-/**
- * Reference implementation of a LookUpTable operation.
- */
-@ExperimentalUnsignedTypes
-fun referenceLut(
-    inputArray: ByteArray,
-    sizeX: Int,
-    sizeY: Int,
-    table: LookupTable,
-    restriction: Range2d?
-): ByteArray {
-    val input = Vector2dArray(inputArray.asUByteArray(), 4, sizeX, sizeY)
-
-    val output = input.createSameSized()
-    input.forEach(restriction) { x, y ->
-        val oldValue = input[x, y]
-        val newValue = byteArrayOf(
-            table.red[oldValue[0].toInt()],
-            table.green[oldValue[1].toInt()],
-            table.blue[oldValue[2].toInt()],
-            table.alpha[oldValue[3].toInt()]
-        )
-        output[x, y] = newValue.asUByteArray()
-    }
-    return output.values.asByteArray()
-}
-

diff --git a/toolkit/test/ReferenceLut3d.kt b/toolkit/test/ReferenceLut3d.kt
deleted file mode 100644
index afd977b..0000000
--- a/toolkit/test/ReferenceLut3d.kt
+++ /dev/null

@@ -1,74 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-import android.renderscript.toolkit.Range2d
-import android.renderscript.toolkit.Rgba3dArray
-
-/**
- * Reference implementation of a 3D LookUpTable operation.
- */
-@ExperimentalUnsignedTypes
-fun referenceLut3d(
-    inputArray: ByteArray,
-    sizeX: Int,
-    sizeY: Int,
-    cube: Rgba3dArray,
-    restriction: Range2d?
-): ByteArray {
-    val input = Vector2dArray(inputArray.asUByteArray(), 4, sizeX, sizeY)
-    val output = input.createSameSized()
-    input.forEach(restriction) { x, y ->
-        output[x, y] = lookup(input[x, y], cube)
-    }
-    return output.values.asByteArray()
-}
-
-@ExperimentalUnsignedTypes
-private fun lookup(input: UByteArray, cube: Rgba3dArray): UByteArray {
-    // Calculate the two points at opposite edges of the size 1
-    // cube that contains our point.
-    val maxIndex = Int4(cube.sizeX - 1, cube.sizeY - 1, cube.sizeZ - 1, 0)
-    val baseCoordinate: Float4 = input.toFloat4() * maxIndex.toFloat4() / 255f
-    val point1: Int4 = baseCoordinate.intFloor()
-    val point2: Int4 = min(point1 + 1, maxIndex)
-    val fractionAwayFromPoint1: Float4 = baseCoordinate - point1.toFloat4()
-
-    // Get the RGBA values at each of the four corners of the size 1 cube.
-    val v000 = cube[point1.x, point1.y, point1.z].toFloat4()
-    val v100 = cube[point2.x, point1.y, point1.z].toFloat4()
-    val v010 = cube[point1.x, point2.y, point1.z].toFloat4()
-    val v110 = cube[point2.x, point2.y, point1.z].toFloat4()
-    val v001 = cube[point1.x, point1.y, point2.z].toFloat4()
-    val v101 = cube[point2.x, point1.y, point2.z].toFloat4()
-    val v011 = cube[point1.x, point2.y, point2.z].toFloat4()
-    val v111 = cube[point2.x, point2.y, point2.z].toFloat4()
-
-    // Do the linear mixing of these eight values.
-    val yz00 = mix(v000, v100, fractionAwayFromPoint1.x)
-    val yz10 = mix(v010, v110, fractionAwayFromPoint1.x)
-    val yz01 = mix(v001, v101, fractionAwayFromPoint1.x)
-    val yz11 = mix(v011, v111, fractionAwayFromPoint1.x)
-
-    val z0 = mix(yz00, yz10, fractionAwayFromPoint1.y)
-    val z1 = mix(yz01, yz11, fractionAwayFromPoint1.y)
-
-    val v = mix(z0, z1, fractionAwayFromPoint1.z)
-
-    // Preserve the alpha of the original value
-    return ubyteArrayOf(v.x.clampToUByte(), v.y.clampToUByte(), v.z.clampToUByte(), input[3])
-}

diff --git a/toolkit/test/ReferenceResize.kt b/toolkit/test/ReferenceResize.kt
deleted file mode 100644
index 023825e..0000000
--- a/toolkit/test/ReferenceResize.kt
+++ /dev/null

@@ -1,157 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-import android.renderscript.toolkit.Range2d
-import kotlin.math.floor
-import kotlin.math.max
-
-var trace = false
-
-/**
- * Reference implementation of a Resize operation.
- */
-@ExperimentalUnsignedTypes
-fun referenceResize(inputArray: ByteArray,
-                    vectorSize: Int,
-                    inSizeX: Int,
-                    inSizeY: Int,
-                    outSizeX: Int, outSizeY: Int,
-                    restriction: Range2d?): ByteArray {
-    val input = Vector2dArray(inputArray.asUByteArray(), vectorSize, inSizeX, inSizeY)
-    val scaleX: Float = input.sizeX.toFloat() / outSizeX.toFloat()
-    val scaleY: Float = input.sizeY.toFloat() / outSizeY.toFloat()
-    val outArray = UByteArray(outSizeX * outSizeY * paddedSize(input.vectorSize))
-    val out = Vector2dArray(outArray, input.vectorSize, outSizeX, outSizeY)
-    out.forEach (restriction) { x, y ->
-        if (x == 1827 && y == 46) {
-            println("Found it")
-            trace = true
-        }
-        val o = bicubicU4(x, y, input, scaleX, scaleY)
-        out[x, y] = o.clampToUByte()
-    }
-    return out.values.asByteArray()
-}
-
-private fun cubicInterpolateF(p0: FloatArray, p1: FloatArray, p2: FloatArray, p3: FloatArray,
-                              x: Float): FloatArray {
-    return p1 + (p2 - p0 + (p0 * 2f - p1 * 5f + p2 * 4f - p3
-            + ((p1 - p2) * 3f + p3 - p0) * x) * x) * x * 0.5f
-}
-
-@ExperimentalUnsignedTypes
-private fun bicubicU4(x: Int, y: Int, gIn: Vector2dArray, scaleX: Float, scaleY: Float): FloatArray {
-    var xf: Float = (x + 0.5f) * scaleX - 0.5f
-    var yf: Float = (y + 0.5f) * scaleY - 0.5f
-
-    val startX: Int = floor(xf - 1).toInt()
-    val startY: Int = floor(yf - 1).toInt()
-    xf -= floor(xf)
-    yf -= floor(yf)
-    val maxX: Int = gIn.sizeX - 1
-    val maxY: Int = gIn.sizeY - 1
-
-    val xs0: Int = max(0, startX + 0)
-    val xs1: Int = max(0, startX + 1)
-    val xs2: Int = kotlin.math.min(maxX, startX + 2)
-    val xs3: Int = kotlin.math.min(maxX, startX + 3)
-
-    val ys0: Int = max(0, startY + 0)
-    val ys1: Int = max(0, startY + 1)
-    val ys2: Int = kotlin.math.min(maxY, startY + 2)
-    val ys3: Int = kotlin.math.min(maxY, startY + 3)
-
-    val p00 = gIn[xs0, ys0].toFloatArray()
-    val p01 = gIn[xs1, ys0].toFloatArray()
-    val p02 = gIn[xs2, ys0].toFloatArray()
-    val p03 = gIn[xs3, ys0].toFloatArray()
-    val p0  = cubicInterpolateF(p00, p01, p02, p03, xf)
-
-    val p10 = gIn[xs0, ys1].toFloatArray()
-    val p11 = gIn[xs1, ys1].toFloatArray()
-    val p12 = gIn[xs2, ys1].toFloatArray()
-    val p13 = gIn[xs3, ys1].toFloatArray()
-    val p1  = cubicInterpolateF(p10, p11, p12, p13, xf)
-
-    val p20 = gIn[xs0, ys2].toFloatArray()
-    val p21 = gIn[xs1, ys2].toFloatArray()
-    val p22 = gIn[xs2, ys2].toFloatArray()
-    val p23 = gIn[xs3, ys2].toFloatArray()
-    val p2  = cubicInterpolateF(p20, p21, p22, p23, xf)
-
-    val p30 = gIn[xs0, ys3].toFloatArray()
-    val p31 = gIn[xs1, ys3].toFloatArray()
-    val p32 = gIn[xs2, ys3].toFloatArray()
-    val p33 = gIn[xs3, ys3].toFloatArray()
-    val p3  = cubicInterpolateF(p30, p31, p32, p33, xf)
-
-    return cubicInterpolateF(p0, p1, p2, p3, yf)
-}
-
-
-/* To be used if we implement Floats
-private fun bicubic_F4(x: Int, y: Int, gin: ByteArray, sizeX: Int, sizeY: Int, scaleX: Float, scaleY: Float): Float4 {
-    var xf: Float = (x + 0.5f) * scaleX - 0.5f
-    var yf: Float = (y + 0.5f) * scaleY - 0.5f
-
-    val startX: Int = floor(xf - 1).toInt()
-    val startY: Int = floor(yf - 1).toInt()
-    xf = xf - floor(xf)
-    yf = yf - floor(yf)
-    val maxX: Int = sizeX - 1
-    val maxY: Int = sizeY - 1
-
-    val xs0: Int = max(0, startX + 0)
-    val xs1: Int = max(0, startX + 1)
-    val xs2: Int = min(maxX, startX + 2)
-    val xs3: Int = min(maxX, startX + 3)
-
-    val ys0: Int = max(0, startY + 0)
-    val ys1: Int = max(0, startY + 1)
-    val ys2: Int = min(maxY, startY + 2)
-    val ys3: Int = min(maxY, startY + 3)
-
-    val p00: Float4 = rsGetElementAt_Float4(gIn, xs0, ys0)
-    val p01: Float4 = rsGetElementAt_Float4(gIn, xs1, ys0)
-    val p02: Float4 = rsGetElementAt_Float4(gIn, xs2, ys0)
-    val p03: Float4 = rsGetElementAt_Float4(gIn, xs3, ys0)
-    val p0: Float4  = cubicInterpolate_F4(p00, p01, p02, p03, xf)
-
-    val p10: Float4 = rsGetElementAt_Float4(gIn, xs0, ys1)
-    val p11: Float4 = rsGetElementAt_Float4(gIn, xs1, ys1)
-    val p12: Float4 = rsGetElementAt_Float4(gIn, xs2, ys1)
-    val p13: Float4 = rsGetElementAt_Float4(gIn, xs3, ys1)
-    val p1: Float4  = cubicInterpolate_F4(p10, p11, p12, p13, xf)
-
-    val p20: Float4 = rsGetElementAt_Float4(gIn, xs0, ys2)
-    val p21: Float4 = rsGetElementAt_Float4(gIn, xs1, ys2)
-    val p22: Float4 = rsGetElementAt_Float4(gIn, xs2, ys2)
-    val p23: Float4 = rsGetElementAt_Float4(gIn, xs3, ys2)
-    val p2: Float4  = cubicInterpolate_F4(p20, p21, p22, p23, xf)
-
-    val p30: Float4 = rsGetElementAt_Float4(gIn, xs0, ys3)
-    val p31: Float4 = rsGetElementAt_Float4(gIn, xs1, ys3)
-    val p32: Float4 = rsGetElementAt_Float4(gIn, xs2, ys3)
-    val p33: Float4 = rsGetElementAt_Float4(gIn, xs3, ys3)
-    val p3: Float4  = cubicInterpolate_F4(p30, p31, p32, p33, xf)
-
-    val p: Float4  = cubicInterpolate_F4(p0, p1, p2, p3, yf)
-
-    return p
-}
-*/

diff --git a/toolkit/test/ReferenceYuvToRgb.kt b/toolkit/test/ReferenceYuvToRgb.kt
deleted file mode 100644
index 4d91cf6..0000000
--- a/toolkit/test/ReferenceYuvToRgb.kt
+++ /dev/null

@@ -1,112 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-import android.renderscript.toolkit.YuvFormat
-import java.lang.IllegalArgumentException
-
-/**
- * Straightforward, unoptimized YUV to RGBA conversion.
- *
- * Every output pixel occupies four bytes produced by [yuvToRGBA4].
- *
- * @param inputSignedArray The YUV image bytes; they are reinterpreted as unsigned values.
- * @param sizeX Width of the image in pixels. Must be even.
- * @param sizeY Height of the image in pixels.
- * @param format Memory layout of [inputSignedArray]. NV21 and YV12 are handled.
- * @return A newly allocated array of sizeX * sizeY * 4 bytes.
- * @throws IllegalArgumentException if [sizeX] is odd or [format] is not handled.
- */
-@ExperimentalUnsignedTypes
-fun referenceYuvToRgb(inputSignedArray: ByteArray, sizeX: Int, sizeY: Int, format: YuvFormat): ByteArray {
-    require(sizeX % 2 == 0) { "The width of the input should be even."}
-    val yuv = inputSignedArray.asUByteArray()
-
-    val rgba = ByteArray(sizeX * sizeY * 4)
-    val pixels = Vector2dArray(rgba.asUByteArray(), 4, sizeX, sizeY)
-
-    when (format) {
-        YuvFormat.NV21 -> {
-            // The Y plane comes first. It is followed by interleaved chroma bytes in
-            // which each V byte is immediately followed by its U byte.
-            val chromaStart = sizeX * sizeY
-
-            for (row in 0 until sizeY) {
-                for (col in 0 until sizeX) {
-                    // One chroma pair is shared by a 2x2 block of pixels.
-                    val pairOffset = (row shr 1) * sizeX + (col shr 1) * 2
-                    pixels[col, row] = yuvToRGBA4(
-                        yuv[row * sizeX + col],
-                        yuv[chromaStart + 1 + pairOffset],
-                        yuv[chromaStart + pairOffset]
-                    )
-                }
-            }
-        }
-
-        YuvFormat.YV12 -> {
-            /* According to https://developer.android.com/reference/kotlin/android/graphics/ImageFormat#yv12,
-             * the luma and chroma strides should be aligned to 16 byte boundaries. If we do
-             * this, we won't get the same results as RenderScript.
-             *
-             * We may want to test & require that sizeX is a multiple of 16/32.
-             */
-            val lumaStride = roundUpTo16(sizeX)
-            val chromaStride = roundUpTo16(lumaStride / 2)
-            val uPlaneStart = lumaStride * sizeY
-            val vPlaneStart = uPlaneStart + chromaStride * sizeY / 2
-
-            for (row in 0 until sizeY) {
-                for (col in 0 until sizeX) {
-                    // NOTE(review): luma rows are addressed with sizeX although the plane
-                    // offsets above are derived from lumaStride; the two only agree when
-                    // sizeX is a multiple of 16. Confirm which one is intended.
-                    val lumaIndex = row * sizeX + col
-                    val chromaOffset = (row shr 1) * chromaStride + (col shr 1)
-                    pixels[col, row] = yuvToRGBA4(
-                        yuv[lumaIndex],
-                        yuv[uPlaneStart + chromaOffset],
-                        yuv[vPlaneStart + chromaOffset],
-                    )
-                }
-            }
-        }
-        else -> throw IllegalArgumentException("Unknown YUV format $format")
-    }
-
-    return rgba
-}
-
-/**
- * Converts one YUV sample to four bytes (red, green, blue, alpha) using integer math.
- *
- * Each colour sum is offset by 128, shifted right by 8 bits and clamped with
- * clampToUByte(). Alpha is always 255.
- */
-@ExperimentalUnsignedTypes
-private fun yuvToRGBA4(y: UByte, u: UByte, v: UByte): UByteArray {
-    val scaledLuma = (y.toInt() - 16) * 298
-    val chromaU = u.toInt() - 128
-    val chromaV = v.toInt() - 128
-
-    val red = (scaledLuma + chromaV * 409 + 128) shr 8
-    val green = (scaledLuma - chromaU * 100 - chromaV * 208 + 128) shr 8
-    val blue = (scaledLuma + chromaU * 516 + 128) shr 8
-
-    val channels = intArrayOf(red, green, blue, 255)
-    return UByteArray(4) { index -> channels[index].clampToUByte() }
-}
-
-/* To be used if we support Float
-private fun yuvToRGBA_f4(y: UByte, u: UByte, v: UByte): UByteArray {
-    val yuv_U_values = floatArrayOf(0f, -0.392f * 0.003921569f, 2.02f * 0.003921569f, 0f)
-    val yuv_V_values = floatArrayOf(1.603f * 0.003921569f, -0.815f * 0.003921569f, 0f, 0f)
-
-    var color = FloatArray(4) {y.toFloat() * 0.003921569f}
-    val fU = FloatArray(4) {u.toFloat() - 128f}
-    val fV = FloatArray(4) {v.toFloat() - 128f}
-
-    color += fU * yuv_U_values;
-    color += fV * yuv_V_values;
-    //color = clamp(color, 0.f, 1.f);
-    return UByteArray(4) { unitFloatClampedToUByte(color[it]) }
-}
-*/

diff --git a/toolkit/test/TimingTracker.kt b/toolkit/test/TimingTracker.kt
deleted file mode 100644
index 81e90f2..0000000
--- a/toolkit/test/TimingTracker.kt
+++ /dev/null

@@ -1,58 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.example.testapp
-
-/**
- * Collects wall-clock timings, in microseconds, of named pieces of work across iterations.
- *
- * @param numberOfIterations Total number of iterations the caller intends to run.
- * @param numberOfIterationsToIgnore Number of leading iterations whose timings are not
- * recorded. Must be smaller than [numberOfIterations].
- */
-class TimingTracker(
-    private val numberOfIterations: Int = 1,
-    private var numberOfIterationsToIgnore: Int = 0
-) {
-    init {
-        require(numberOfIterations > numberOfIterationsToIgnore)
-    }
-
-    // One slot per recorded iteration for every measured name.
-    private val timings = mutableMapOf<String, IntArray>()
-    private var currentIteration: Int = 0
-
-    /** Moves on to the next iteration; later measurements land in the next slot. */
-    fun nextIteration() {
-        currentIteration++
-    }
-
-    /**
-     * Runs [workToTime] and returns its result. Unless the current iteration is one of
-     * the ignored ones, the elapsed time is added to the slot of [name] for this iteration.
-     */
-    fun <T> measure(name: String, workToTime: () -> T): T {
-        val startNanos = System.nanoTime()
-        val result = workToTime()
-        if (currentIteration < numberOfIterationsToIgnore) {
-            return result
-        }
-        val elapsedMicros: Int = ((System.nanoTime() - startNanos) / 1000).toInt()
-        val slots = timings.getOrPut(name) {
-            IntArray(numberOfIterations - numberOfIterationsToIgnore)
-        }
-        slots[currentIteration - numberOfIterationsToIgnore] += elapsedMicros
-        return result
-    }
-
-    /**
-     * Prints the best timing of every name, one per line, and returns a three-part summary:
-     * the raw timings, the timings relative to the overall minimum, and the best timings.
-     */
-    fun report(): String {
-        val fastest: Int = timings.values.mapNotNull { it.minOrNull() }.minOrNull() ?: Int.MAX_VALUE
-
-        val bestPerName = timings.map { (name, series) -> "$name: ${series.minOrNull()}" }
-        println(bestPerName.joinToString(separator = "\n"))
-
-        val rawSection = timings.map { (name, series) -> "$name: ${series.joinToString()}" }.joinToString()
-        val relativeSection = timings.map { (name, series) ->
-            name + ": " + series.joinToString { "%.2f".format(it.toFloat() / fastest) }
-        }.joinToString()
-
-        return rawSection + "\n\n" + relativeSection + "\n\n" + bestPerName.joinToString()
-    }
-}
-

diff --git a/toolkit/test/res/drawable-nodpi/img800x450a.jpg b/toolkit/test/res/drawable-nodpi/img800x450a.jpg
deleted file mode 100644
index 6d5b623..0000000
--- a/toolkit/test/res/drawable-nodpi/img800x450a.jpg
+++ /dev/null
Binary files differ

diff --git a/toolkit/test/res/drawable-nodpi/img800x450b.jpg b/toolkit/test/res/drawable-nodpi/img800x450b.jpg
deleted file mode 100644
index 2013e07..0000000
--- a/toolkit/test/res/drawable-nodpi/img800x450b.jpg
+++ /dev/null
Binary files differ

diff --git a/toolkit/x86.cpp b/toolkit/x86.cpp
deleted file mode 100644
index d25c3d7..0000000
--- a/toolkit/x86.cpp
+++ /dev/null

@@ -1,1323 +0,0 @@
-/*
- * Copyright (C) 2011 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdint.h>
-#include <x86intrin.h>
-
-namespace android {
-namespace renderscript {
-
-/*
- * Zero-extends the four least-significant bytes of x into four packed 32-bit integers.
- * Uses the native instruction on SSE4.1 and a byte shuffle on SSSE3.
- */
-static inline __m128i cvtepu8_epi32(__m128i x) {
-#if !defined(__SSE4_1__) && !defined(__SSSE3__)
-#   error "Require at least SSSE3"
-#elif defined(__SSE4_1__)
-    return _mm_cvtepu8_epi32(x);
-#else
-    /* Each 32-bit lane takes one source byte; the 0xff selectors zero the other three. */
-    return _mm_shuffle_epi8(x, _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00));
-#endif
-}
-
-/*
- * Packs the eight signed 32-bit lanes of lo and hi into unsigned 16-bit lanes with
- * saturation: negative values become 0 and values above 0xffff become 0xffff.
- * lo fills the four low result lanes and hi the four high ones.
- */
-static inline __m128i packus_epi32(__m128i lo, __m128i hi) {
-#if !defined(__SSE4_1__) && !defined(__SSSE3__)
-#   error "Require at least SSSE3"
-#elif defined(__SSE4_1__)
-    return _mm_packus_epi32(lo, hi);
-#else
-    const __m128i kZero = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000);
-    const __m128i kMax16 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff);
-    /* Shuffle masks gathering the low 16 bits of every 32-bit lane into one half. */
-    const __m128i kGatherIntoLow = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100);
-    const __m128i kGatherIntoHigh = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff);
-
-    /* Lower clamp: lanes that are not greater than zero are cleared. */
-    const __m128i loPositive = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, kZero));
-    const __m128i hiPositive = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, kZero));
-
-    /* Upper clamp: lanes above 0xffff get every bit set, so their low half reads 0xffff. */
-    const __m128i loClamped = _mm_or_si128(loPositive, _mm_cmpgt_epi32(loPositive, kMax16));
-    const __m128i hiClamped = _mm_or_si128(hiPositive, _mm_cmpgt_epi32(hiPositive, kMax16));
-
-    return _mm_or_si128(_mm_shuffle_epi8(loClamped, kGatherIntoLow),
-                        _mm_shuffle_epi8(hiClamped, kGatherIntoHigh));
-#endif
-}
-
-/*
- * Multiplies corresponding 32-bit lanes of x and y and keeps the low 32 bits of
- * every product.
- */
-static inline __m128i mullo_epi32(__m128i x, __m128i y) {
-#if !defined(__SSE4_1__) && !defined(__SSSE3__)
-#   error "Require at least SSSE3"
-#elif defined(__SSE4_1__)
-    return _mm_mullo_epi32(x, y);
-#else
-    /* Selects the low 32 bits of each 64-bit product. */
-    const __m128i kLowHalf = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff);
-
-    /* _mm_mul_epu32 multiplies lanes 0 and 2 only, so lanes 1 and 3 are shifted down first. */
-    const __m128i evenProducts = _mm_and_si128(_mm_mul_epu32(x, y), kLowHalf);
-    const __m128i oddProducts = _mm_and_si128(_mm_mul_epu32(_mm_srli_si128(x, 4),
-                                                            _mm_srli_si128(y, 4)),
-                                              kLowHalf);
-
-    /* Move the odd products back up into lanes 1 and 3 and merge. */
-    return _mm_or_si128(evenProducts, _mm_slli_si128(oddProducts, 4));
-#endif
-}
-
-/*
- * Byte-wise select: returns the bytes of y where 'mask' is set and the bytes of x
- * elsewhere. Every byte of 'mask' must be either 0x00 or 0xff.
- */
-static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) {
-#if !defined(__SSE4_1__) && !defined(__SSSE3__)
-#   error "Require at least SSSE3"
-#elif defined(__SSE4_1__)
-    return _mm_blendv_epi8(x, y, mask);
-#else
-    const __m128i takenFromY = _mm_and_si128(y, mask);
-    const __m128i takenFromX = _mm_andnot_si128(mask, x);
-    return _mm_or_si128(takenFromX, takenFromY);
-#endif
-}
-/*
- * 3x3 convolution over interleaved 8-bit pixels of four bytes each.
- *
- * Every loop iteration reads four pixels from each of the three input rows, writes
- * two output pixels (8 bytes) to dst and advances all pointers by 8 bytes. Output
- * pixel k of an iteration combines input pixels k, k+1 and k+2 of each row.
- *
- *   dst         destination; 8 bytes are stored per iteration.
- *   y0, y1, y2  input rows weighted by coef[0..2], coef[3..5] and coef[6..8].
- *   coef        nine 16-bit coefficients. The last load reads 8 bytes starting at
- *               coef[8], so the array must be readable beyond the ninth entry.
- *   count       number of iterations, i.e. number of output pixel pairs.
- *
- * The sums are shifted right by 8 bits (presumably the coefficients are 8.8 fixed
- * point; TODO confirm with the caller) and saturated to 0..255.
- */
-extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0,
-                                          const void *y1, const void *y2,
-                                          const short *coef, uint32_t count) {
-    __m128i x;
-    __m128i c0, c2, c4, c6, c8;
-    __m128i r0, r1, r2; /* NOTE(review): not referenced anywhere in this function. */
-    __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11;
-    __m128i o0, o1;
-    uint32_t i;
-    /* Broadcast the coefficients as 16-bit pairs so they can feed _mm_madd_epi16. */
-    x = _mm_loadl_epi64((const __m128i *)(coef+0));
-    c0 = _mm_shuffle_epi32(x, 0x00); /* (coef[0], coef[1]) in every 32-bit lane */
-    c2 = _mm_shuffle_epi32(x, 0x55); /* (coef[2], coef[3]) */
-    x = _mm_loadl_epi64((const __m128i *)(coef+4));
-    c4 = _mm_shuffle_epi32(x, 0x00); /* (coef[4], coef[5]) */
-    c6 = _mm_shuffle_epi32(x, 0x55); /* (coef[6], coef[7]) */
-    x = _mm_loadl_epi64((const __m128i *)(coef+8));
-    c8 = _mm_shuffle_epi32(x, 0x00); /* (coef[8], coef[9]); only coef[8] contributes */
-
-    for (i = 0; i < count; ++i) {
-        /* One pixel per register, its four bytes widened to 16-bit lanes. */
-        p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128());
-        p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
-        p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
-        p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
-        p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
-        p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
-        p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
-        p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
-        p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
-        p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
-        p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
-        p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
-        /* o0 accumulates the first output pixel, o1 the second (window moved one pixel right). */
-        o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);
-        o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);
-
-        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2));
-        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2));
-
-        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4));
-        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4));
-
-        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6));
-        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));
-        /* The last tap is paired with a zero register, so the second half of c8 is ignored. */
-        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8));
-        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8));
-
-        o0 = _mm_srai_epi32(o0, 8);
-        o1 = _mm_srai_epi32(o1, 8);
-        /* Saturate to unsigned 16 bits, then to unsigned 8 bits, and store both pixels. */
-        o0 = packus_epi32(o0, o1);
-        o0 = _mm_packus_epi16(o0, o0);
-        _mm_storel_epi64((__m128i *)dst, o0);
-
-        y0 = (const char *)y0 + 8;
-        y1 = (const char *)y1 + 8;
-        y2 = (const char *)y2 + 8;
-        dst = (char *)dst + 8;
-    }
-}
-
-/*
- * Multiplies interleaved 8-bit, 4-channel pixels by a 4x4 matrix of 16-bit
- * coefficients, four pixels (16 bytes) per loop iteration.
- *
- * For every pixel and every output channel j = 0..3:
- *   out[j] = (in[0]*coef[j] + in[1]*coef[4+j] + in[2]*coef[8+j] + in[3]*coef[12+j]) >> 8
- * saturated to 0..255.
- *
- *   dst    destination, 16 bytes written per iteration; no alignment requirement.
- *   src    source, 16 bytes read per iteration; no alignment requirement.
- *   coef   sixteen 16-bit coefficients laid out as shown above.
- *   count  number of iterations, i.e. number of 4-pixel groups.
- */
-void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
-                                  const short *coef, uint32_t count) {
-    /* Transposes a 4x4 block of bytes: planar x0..x3,y0..y3,... to interleaved x0,y0,z0,w0,... */
-    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
-                                      14, 10, 6, 2,
-                                      13,  9, 5, 1,
-                                      12,  8, 4, 0);
-
-    /* Widen channels 0 and 1 (Mxy) or 2 and 3 (Mzw) of each pixel to 16-bit lanes. */
-    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
-    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
-    __m128i c0, c1, c2, c3;
-    __m128i i4, o4;
-    __m128i xy, zw;
-    __m128i x2, y2, z2, w2;
-    uint32_t i;
-
-    /* c0 holds the pairs (coef[j], coef[4+j]), c2 the pairs (coef[8+j], coef[12+j]). */
-    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
-    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
-    c0 = _mm_unpacklo_epi16(c0, c1);
-
-    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
-    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
-    c2 = _mm_unpacklo_epi16(c2, c3);
-
-    for (i = 0; i < count; ++i) {
-        /*
-         * Unaligned load: src is an arbitrary pixel pointer, and both the matching
-         * store below and the 3x3/Dot variants already use unaligned accesses.
-         */
-        i4 = _mm_loadu_si128((const __m128i *)src);
-        xy = _mm_shuffle_epi8(i4, Mxy);
-        zw = _mm_shuffle_epi8(i4, Mzw);
-
-        x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
-        y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
-        z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
-        w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff));
-
-        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
-        y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
-        z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
-        w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff)));
-
-        x2 = _mm_srai_epi32(x2, 8);
-        y2 = _mm_srai_epi32(y2, 8);
-        z2 = _mm_srai_epi32(z2, 8);
-        w2 = _mm_srai_epi32(w2, 8);
-
-        /* Saturate to bytes; the result is planar and is re-interleaved by T4x4. */
-        x2 = packus_epi32(x2, y2);
-        z2 = packus_epi32(z2, w2);
-        o4 = _mm_packus_epi16(x2, z2);
-
-        o4 = _mm_shuffle_epi8(o4, T4x4);
-        _mm_storeu_si128((__m128i *)dst, o4);
-
-        src = (const char *)src + 16;
-        dst = (char *)dst + 16;
-    }
-}
-/*
- * Color-matrix kernel that recomputes the first three channels of interleaved
- * 8-bit, 4-channel pixels and copies the fourth channel through unchanged.
- * Four pixels (16 bytes) are handled per loop iteration.
- *
- * For every pixel and j = 0..2:
- *   out[j] = (in[0]*coef[j] + in[1]*coef[4+j] + in[2]*coef[8+j] + in[3]*coef[12+j]) >> 8
- * saturated to 0..255, and out[3] = in[3].
- *
- * NOTE(review): coef[12..14] still take part in the sums above; presumably the
- * caller leaves them at zero for a pure 3x3 matrix. TODO confirm.
- *
- *   dst    destination, 16 bytes written per iteration (unaligned store).
- *   src    source, 16 bytes read per iteration (unaligned load).
- *   coef   sixteen 16-bit values are loaded, laid out as shown above.
- *   count  number of iterations, i.e. number of 4-pixel groups.
- */
-void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
-                                  const short *coef, uint32_t count) {
-    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
-                                      14, 10, 6, 2,
-                                      13,  9, 5, 1,
-                                      12,  8, 4, 0);
-    /* Mxy widens channels 0 and 1 of each pixel to 16-bit lanes, Mzw channels 2 and 3. */
-    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
-    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
-
-    __m128i c0, c1, c2, c3;
-    __m128i i4, o4;
-    __m128i xy, zw;
-    __m128i x2, y2, z2, w2;
-    uint32_t i;
-    /* c0 holds the pairs (coef[j], coef[4+j]), c2 the pairs (coef[8+j], coef[12+j]). */
-    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
-    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
-    c0 = _mm_unpacklo_epi16(c0, c1);
-
-    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
-    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
-    c2 = _mm_unpacklo_epi16(c2, c3);
-
-    for (i = 0; i < count; ++i) {
-        i4 = _mm_loadu_si128((const __m128i *)src);
-        xy = _mm_shuffle_epi8(i4, Mxy);
-        zw = _mm_shuffle_epi8(i4, Mzw);
-
-        x2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
-        y2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
-        z2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
-
-        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
-        y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
-        z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
-
-        x2 = _mm_srai_epi32(x2, 8);
-        y2 = _mm_srai_epi32(y2, 8);
-        z2 = _mm_srai_epi32(z2, 8);
-        w2 = _mm_srli_epi32(zw, 16); /* upper half of each zw lane is input channel 3 */
-        /* Saturate to bytes; the result is planar and is re-interleaved by T4x4. */
-        x2 = packus_epi32(x2, y2);
-        z2 = packus_epi32(z2, w2);
-        o4 = _mm_packus_epi16(x2, z2);
-
-        o4 = _mm_shuffle_epi8(o4, T4x4);
-        _mm_storeu_si128((__m128i *)dst, o4);
-
-        src = (const char *)src + 16;
-        dst = (char *)dst + 16;
-    }
-}
-/*
- * Color-matrix kernel that writes one dot product to the first three channels of
- * interleaved 8-bit, 4-channel pixels and copies the fourth channel through.
- * Four pixels (16 bytes) are handled per loop iteration.
- *
- * For every pixel:
- *   dot = (in[0]*coef[0] + in[1]*coef[4] + in[2]*coef[8] + in[3]*coef[12]) >> 8
- *   out[0] = out[1] = out[2] = dot saturated to 0..255, and out[3] = in[3].
- *
- *   dst    destination, 16 bytes written per iteration (unaligned store).
- *   src    source, 16 bytes read per iteration (unaligned load).
- *   coef   only coef[0], coef[4], coef[8] and coef[12] influence the result.
- *   count  number of iterations, i.e. number of 4-pixel groups.
- */
-void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
-                                  const short *coef, uint32_t count) {
-    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
-                                      14, 10, 6, 2,
-                                      13,  9, 5, 1,
-                                      12,  8, 4, 0);
-    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
-    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
-    __m128i c0, c1, c2, c3;
-    __m128i i4, o4;
-    __m128i xy, zw;
-    __m128i x2, y2, z2, w2;
-    uint32_t i;
-    /* After the shuffles and unpacks every 32-bit lane of c0 is (coef[0], coef[4]). */
-    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
-    c0 = _mm_shufflelo_epi16(c0, 0);
-    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
-    c1 = _mm_shufflelo_epi16(c1, 0);
-    c0 = _mm_unpacklo_epi16(c0, c1);
-    /* Likewise every 32-bit lane of c2 is (coef[8], coef[12]). */
-    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
-    c2 = _mm_shufflelo_epi16(c2, 0);
-    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
-    c3 = _mm_shufflelo_epi16(c3, 0);
-    c2 = _mm_unpacklo_epi16(c2, c3);
-
-    for (i = 0; i < count; ++i) {
-        i4 = _mm_loadu_si128((const __m128i *)src);
-
-        xy = _mm_shuffle_epi8(i4, Mxy);
-        zw = _mm_shuffle_epi8(i4, Mzw);
-
-        x2 =  _mm_madd_epi16(xy, c0);
-        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2));
-
-        x2 = _mm_srai_epi32(x2, 8);
-        y2 = x2; /* the same dot product is replicated into channels 1 and 2 */
-        z2 = x2;
-        w2 = _mm_srli_epi32(zw, 16); /* upper half of each zw lane is input channel 3 */
-        /* Saturate to bytes; the result is planar and is re-interleaved by T4x4. */
-        x2 = packus_epi32(x2, y2);
-        z2 = packus_epi32(z2, w2);
-        o4 = _mm_packus_epi16(x2, z2);
-
-        o4 = _mm_shuffle_epi8(o4, T4x4);
-        _mm_storeu_si128((__m128i *)dst, o4);
-
-        src = (const char *)src + 16;
-        dst = (char *)dst + 16;
-    }
-}
-/*
- * Vertical accumulation pass over interleaved 8-bit, 4-channel pixels. For each
- * pixel column, rct rows are weighted by gptr[0..rct-1] and summed; the sums are
- * written as floats without any scaling or clamping.
- *
- *   dst     receives four floats per pixel (32 bytes per loop iteration).
- *   pin     first input row; row r of a column is read at pin + r * stride bytes.
- *   stride  distance in bytes between consecutive input rows.
- *   gptr    float weights, one per row.
- *   rct     number of rows/weights to accumulate.
- *   x1, x2  pixel range [x1, x2). Two pixels are handled per iteration, so when the
- *           range length is odd one pixel beyond x2 - 1 is also read and written.
- */
-void rsdIntrinsicBlurVFU4_K(void *dst,
-                          const void *pin, int stride, const void *gptr,
-                          int rct, int x1, int x2) {
-    const char *pi;
-    __m128i pi0, pi1;
-    __m128 pf0, pf1;
-    __m128 bp0, bp1;
-    __m128 x;
-    int r;
-
-    for (; x1 < x2; x1 += 2) {
-        pi = (const char *)pin + (x1 << 2); /* four bytes per pixel */
-        bp0 = _mm_setzero_ps();
-        bp1 = _mm_setzero_ps();
-
-        for (r = 0; r < rct; ++r) {
-            x = _mm_load_ss((const float *)gptr + r);
-            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0)); /* broadcast the weight */
-
-            pi0 = _mm_cvtsi32_si128(*(const int *)pi);
-            pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1));
-            /* Zero-extend the four bytes of each pixel and convert them to floats. */
-            pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0));
-            pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1));
-
-            bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x));
-            bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x));
-
-            pi += stride; /* step down to the next row */
-        }
-
-        _mm_storeu_ps((float *)dst, bp0);
-        _mm_storeu_ps((float *)dst + 4, bp1);
-        dst = (char *)dst + 32;
-    }
-}
-/*
- * Horizontal accumulation pass over pixels stored as four floats each. For every
- * pixel x in [x1, x2) the weighted sum of pixels x, x+1, ... is computed with the
- * weights gptr[0], gptr[1], ..., converted to 32-bit integers, and the low byte of
- * each channel is stored.
- *
- *   dst     receives four bytes per pixel. Values outside 0..255 are not saturated;
- *           only the low 8 bits of each converted channel are kept.
- *   pin     float input, four floats per pixel. It is read with aligned loads, so
- *           it must be 16-byte aligned.
- *   gptr    float weights.
- *   rct     number of weights. Taps after the first are consumed in pairs, so an
- *           even rct makes the loop read one weight and one pixel past the end.
- *   x1, x2  pixel range [x1, x2), one pixel per iteration.
- */
-void rsdIntrinsicBlurHFU4_K(void *dst,
-                          const void *pin, const void *gptr,
-                          int rct, int x1, int x2) {
-    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
-    const float *pi;
-    __m128 pf, x, y;
-    __m128i o;
-    int r;
-
-    for (; x1 < x2; ++x1) {
-        /* rct is defined as 2*r+1 by the caller */
-        x = _mm_load_ss((const float *)gptr);
-        x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
-
-        pi = (const float *)pin + (x1 << 2);
-        pf = _mm_mul_ps(x, _mm_load_ps(pi));
-
-        for (r = 1; r < rct; r += 2) {
-            x = _mm_load_ss((const float *)gptr + r);
-            y = _mm_load_ss((const float *)gptr + r + 1);
-            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
-            y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0));
-
-            pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2))));
-            pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4)));
-        }
-
-        o = _mm_cvtps_epi32(pf);
-        *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8)); /* bytes 0, 4, 8, 12 */
-        dst = (char *)dst + 4;
-    }
-}
-
-/*
- * Horizontal accumulation pass over single-channel float data, four output values
- * per loop iteration. For every x in [x1, x2):
- *   sum(x) = gptr[0]*pin[x] + gptr[1]*pin[x+1] + ... 
- * The sums are converted to 32-bit integers and the low byte of each is stored.
- *
- *   dst     receives one byte per value (four bytes per iteration). Values outside
- *           0..255 are not saturated; only the low 8 bits are kept.
- *   pin     float input, read with unaligned loads.
- *   gptr    float weights, read with unaligned loads.
- *   rct     number of weights. Taps after the first are consumed four at a time, so
- *           unless rct - 1 is a multiple of four the loop reads weights and input
- *           past the nominal end.
- *   x1, x2  range [x1, x2), advanced in steps of four.
- */
-void rsdIntrinsicBlurHFU1_K(void *dst,
-                          const void *pin, const void *gptr,
-                          int rct, int x1, int x2) {
-    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
-    const float *pi;
-    __m128 pf, g0, g1, g2, g3, gx, p0, p1;
-    __m128i o, lo, hi;
-    int r;
-
-    for (; x1 < x2; x1+=4) {
-        g0 = _mm_load_ss((const float *)gptr);
-        g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0));
-
-        pi = (const float *)pin + x1;
-        pf = _mm_mul_ps(g0, _mm_loadu_ps(pi));
-
-        for (r = 1; r < rct; r += 4) {
-            gx = _mm_loadu_ps((const float *)gptr + r);
-            p0 = _mm_loadu_ps(pi + r);
-            p1 = _mm_loadu_ps(pi + r + 4);
-
-            /*
-             * _mm_alignr_epi8 operates on integer vectors. The bit-preserving casts
-             * make the float <-> integer reinterpretation explicit instead of relying
-             * on the compiler accepting lax vector conversions.
-             */
-            lo = _mm_castps_si128(p0);
-            hi = _mm_castps_si128(p1);
-
-            g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0));
-            pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0));
-            /* Each alignr yields the input window moved one more float to the right. */
-            g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1));
-            pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_castsi128_ps(_mm_alignr_epi8(hi, lo, 4))));
-            g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2));
-            pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_castsi128_ps(_mm_alignr_epi8(hi, lo, 8))));
-            g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3));
-            pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_castsi128_ps(_mm_alignr_epi8(hi, lo, 12))));
-        }
-
-        o = _mm_cvtps_epi32(pf);
-        /* Keep byte 0 of each 32-bit lane: four results packed into one 32-bit store. */
-        *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
-        dst = (char *)dst + 4;
-    }
-}
-/*
- * Converts YUV data with interleaved chroma to interleaved RGBA bytes using
- * integer arithmetic, four pixels per loop iteration.
- *
- * With y' = Y - param[8], u' = U - param[16] and v' = V - param[16], per pixel:
- *   R = (param[0]*y' + param[1]*v' + param[16]) >> 8
- *   G = (param[0]*y' + param[2]*u' + param[4]*v' + param[16]) >> 8
- *   B = (param[0]*y' + param[3]*u' + param[16]) >> 8
- *   A = 255
- * each saturated to 0..255. param[16] is used both as the chroma bias and as the
- * term added before the shift.
- *
- *   dst    destination, 16 bytes written per iteration (unaligned store).
- *   pY     luma bytes, four consumed per iteration.
- *   pUV    interleaved chroma bytes, four consumed per iteration in V,U,V,U order;
- *          each V,U pair is applied to two adjacent pixels.
- *   count  the loop runs 2 * count times, i.e. it converts 8 * count pixels.
- *   param  16-bit parameter table; indices 0..4, 8 and 16 are read.
- */
-void rsdIntrinsicYuv_K(void *dst,
-                       const unsigned char *pY, const unsigned char *pUV,
-                       uint32_t count, const short *param) {
-    __m128i biasY, biasUV;
-    __m128i c0, c1, c2, c3, c4;
-
-    biasY = _mm_set1_epi32(param[8]);   /*  16 */
-    biasUV = _mm_set1_epi32(param[16]); /* 128 */
-
-    c0 = _mm_set1_epi32(param[0]);  /*  298 */
-    c1 = _mm_set1_epi32(param[1]);  /*  409 */
-    c2 = _mm_set1_epi32(param[2]);  /* -100 */
-    c3 = _mm_set1_epi32(param[3]);  /*  516 */
-    c4 = _mm_set1_epi32(param[4]);  /* -208 */
-
-    __m128i Y, UV, U, V, R, G, B, A;
-
-    A = _mm_set1_epi32(255);
-    uint32_t i;
-
-    for (i = 0; i < (count << 1); ++i) {
-        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
-        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
-
-        Y = _mm_sub_epi32(Y, biasY);
-        UV = _mm_sub_epi32(UV, biasUV);
-
-        U = _mm_shuffle_epi32(UV, 0xf5); /* lanes 1,1,3,3: the odd chroma bytes */
-        V = _mm_shuffle_epi32(UV, 0xa0); /* lanes 0,0,2,2: the even chroma bytes */
-
-        Y = mullo_epi32(Y, c0);
-
-        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
-        R = _mm_add_epi32(R, biasUV);
-        R = _mm_srai_epi32(R, 8);
-
-        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
-        G = _mm_add_epi32(G, mullo_epi32(V, c4));
-        G = _mm_add_epi32(G, biasUV);
-        G = _mm_srai_epi32(G, 8);
-
-        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
-        B = _mm_add_epi32(B, biasUV);
-        B = _mm_srai_epi32(B, 8);
-
-        __m128i y1, y2, y3, y4;
-        /* Saturate to bytes (planar R,G,B,A), then transpose to interleaved RGBA. */
-        y1 = packus_epi32(R, G);
-        y2 = packus_epi32(B, A);
-        y3 = _mm_packus_epi16(y1, y2);
-        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
-                                          14, 10, 6, 2,
-                                          13,  9, 5, 1,
-                                          12,  8, 4, 0);
-        y4 = _mm_shuffle_epi8(y3, T4x4);
-        _mm_storeu_si128((__m128i *)dst, y4);
-        pY += 4;
-        pUV += 4;
-        dst = (__m128i *)dst + 1;
-    }
-}
-/*
- * Converts YUV data with interleaved chroma to interleaved RGBA bytes using
- * integer arithmetic, four pixels per loop iteration. Identical to
- * rsdIntrinsicYuv_K except that the chroma bytes are taken in U,V,U,V order.
- *
- * With y' = Y - param[8], u' = U - param[16] and v' = V - param[16], per pixel:
- *   R = (param[0]*y' + param[1]*v' + param[16]) >> 8
- *   G = (param[0]*y' + param[2]*u' + param[4]*v' + param[16]) >> 8
- *   B = (param[0]*y' + param[3]*u' + param[16]) >> 8
- *   A = 255
- * each saturated to 0..255. param[16] is used both as the chroma bias and as the
- * term added before the shift.
- *
- *   dst    destination, 16 bytes written per iteration (unaligned store).
- *   pY     luma bytes, four consumed per iteration.
- *   pUV    interleaved chroma bytes, four consumed per iteration in U,V,U,V order;
- *          each U,V pair is applied to two adjacent pixels.
- *   count  the loop runs 2 * count times, i.e. it converts 8 * count pixels.
- *   param  16-bit parameter table; indices 0..4, 8 and 16 are read.
- */
-void rsdIntrinsicYuvR_K(void *dst,
-                       const unsigned char *pY, const unsigned char *pUV,
-                       uint32_t count, const short *param) {
-    __m128i biasY, biasUV;
-    __m128i c0, c1, c2, c3, c4;
-
-    biasY = _mm_set1_epi32(param[8]);   /*  16 */
-    biasUV = _mm_set1_epi32(param[16]); /* 128 */
-
-    c0 = _mm_set1_epi32(param[0]);  /*  298 */
-    c1 = _mm_set1_epi32(param[1]);  /*  409 */
-    c2 = _mm_set1_epi32(param[2]);  /* -100 */
-    c3 = _mm_set1_epi32(param[3]);  /*  516 */
-    c4 = _mm_set1_epi32(param[4]);  /* -208 */
-
-    __m128i Y, UV, U, V, R, G, B, A;
-
-    A = _mm_set1_epi32(255);
-    uint32_t i;
-
-    for (i = 0; i < (count << 1); ++i) {
-        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
-        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
-
-        Y = _mm_sub_epi32(Y, biasY);
-        UV = _mm_sub_epi32(UV, biasUV);
-
-        V = _mm_shuffle_epi32(UV, 0xf5); /* lanes 1,1,3,3: the odd chroma bytes */
-        U = _mm_shuffle_epi32(UV, 0xa0); /* lanes 0,0,2,2: the even chroma bytes */
-
-        Y = mullo_epi32(Y, c0);
-
-        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
-        R = _mm_add_epi32(R, biasUV);
-        R = _mm_srai_epi32(R, 8);
-
-        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
-        G = _mm_add_epi32(G, mullo_epi32(V, c4));
-        G = _mm_add_epi32(G, biasUV);
-        G = _mm_srai_epi32(G, 8);
-
-        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
-        B = _mm_add_epi32(B, biasUV);
-        B = _mm_srai_epi32(B, 8);
-
-        __m128i y1, y2, y3, y4;
-        /* Saturate to bytes (planar R,G,B,A), then transpose to interleaved RGBA. */
-        y1 = packus_epi32(R, G);
-        y2 = packus_epi32(B, A);
-        y3 = _mm_packus_epi16(y1, y2);
-        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
-                                          14, 10, 6, 2,
-                                          13,  9, 5, 1,
-                                          12,  8, 4, 0);
-        y4 = _mm_shuffle_epi8(y3, T4x4);
-        _mm_storeu_si128((__m128i *)dst, y4);
-        pY += 4;
-        pUV += 4;
-        dst = (__m128i *)dst + 1;
-    }
-}
-/*
- * Converts YUV data with separate U and V inputs to interleaved RGBA bytes using
- * integer arithmetic, four pixels per loop iteration.
- *
- * With y' = Y - param[8], u' = U - param[16] and v' = V - param[16], per pixel:
- *   R = (param[0]*y' + param[1]*v' + param[16]) >> 8
- *   G = (param[0]*y' + param[2]*u' + param[4]*v' + param[16]) >> 8
- *   B = (param[0]*y' + param[3]*u' + param[16]) >> 8
- *   A = 255
- * each saturated to 0..255. param[16] is used both as the chroma bias and as the
- * term added before the shift.
- *
- *   dst     destination, 16 bytes written per iteration (unaligned store).
- *   pY      luma bytes, four consumed per iteration.
- *   pU, pV  chroma bytes, four consumed from each per iteration; pixel k of an
- *           iteration uses pU[k] and pV[k].
- *   count   the loop runs 2 * count times, i.e. it converts 8 * count pixels.
- *   param   16-bit parameter table; indices 0..4, 8 and 16 are read.
- *
- * NOTE(review): unlike the interleaved variants no chroma byte is shared between
- * pixels here, so pU and pV advance at the same rate as pY. Confirm this matches
- * the chroma layout the caller passes in.
- */
-void rsdIntrinsicYuv2_K(void *dst,
-                       const unsigned char *pY, const unsigned char *pU,
-                       const unsigned char *pV, uint32_t count, const short *param) {
-    __m128i biasY, biasUV;
-    __m128i c0, c1, c2, c3, c4;
-
-    biasY = _mm_set1_epi32(param[8]);   /*  16 */
-    biasUV = _mm_set1_epi32(param[16]); /* 128 */
-
-    c0 = _mm_set1_epi32(param[0]);  /*  298 */
-    c1 = _mm_set1_epi32(param[1]);  /*  409 */
-    c2 = _mm_set1_epi32(param[2]);  /* -100 */
-    c3 = _mm_set1_epi32(param[3]);  /*  516 */
-    c4 = _mm_set1_epi32(param[4]);  /* -208 */
-
-    __m128i Y, U, V, R, G, B, A;
-
-    A = _mm_set1_epi32(255);
-    uint32_t i;
-
-    for (i = 0; i < (count << 1); ++i) {
-        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
-        U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU));
-		V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV));
-
-        Y = _mm_sub_epi32(Y, biasY);
-        U = _mm_sub_epi32(U, biasUV);
-		V = _mm_sub_epi32(V, biasUV);
-
-        Y = mullo_epi32(Y, c0);
-
-        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
-        R = _mm_add_epi32(R, biasUV);
-        R = _mm_srai_epi32(R, 8);
-
-        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
-        G = _mm_add_epi32(G, mullo_epi32(V, c4));
-        G = _mm_add_epi32(G, biasUV);
-        G = _mm_srai_epi32(G, 8);
-
-        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
-        B = _mm_add_epi32(B, biasUV);
-        B = _mm_srai_epi32(B, 8);
-
-        __m128i y1, y2, y3, y4;
-        /* Saturate to bytes (planar R,G,B,A), then transpose to interleaved RGBA. */
-        y1 = packus_epi32(R, G);
-        y2 = packus_epi32(B, A);
-        y3 = _mm_packus_epi16(y1, y2);
-        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
-                                          14, 10, 6, 2,
-                                          13,  9, 5, 1,
-                                          12,  8, 4, 0);
-        y4 = _mm_shuffle_epi8(y3, T4x4);
-        _mm_storeu_si128((__m128i *)dst, y4);
-        pY += 4;
-        pU += 4;
-		pV += 4;
-        dst = (__m128i *)dst + 1;
-    }
-}
-
-extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0,
-                                          const void *y1, const void *y2,
-                                          const void *y3, const void *y4,
-                                          const short *coef, uint32_t count) {
-    __m128i x;
-    __m128i c0, c2, c4, c6, c8, c10, c12;
-    __m128i c14, c16, c18, c20, c22, c24;
-    __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
-    __m128i p0,  p1,  p2,  p3,  p4,  p5,  p6,  p7;
-    __m128i p8,  p9, p10, p11, p12, p13, p14, p15;
-    __m128i p16, p17, p18, p19, p20, p21, p22, p23;
-    __m128i p24, p25, p26, p27, p28, p29, p30, p31;
-    __m128i p32, p33, p34, p35, p36, p37, p38, p39;
-    __m128i o0, o1, o2, o3;
-    uint32_t i;
-
-    x = _mm_loadl_epi64((const __m128i *)(coef+0));
-    c0  = _mm_shuffle_epi32(x, 0x00);
-    c2  = _mm_shuffle_epi32(x, 0x55);
-
-    x = _mm_loadl_epi64((const __m128i *)(coef+4));
-    c4  = _mm_shuffle_epi32(x, 0x00);
-    c6  = _mm_shuffle_epi32(x, 0x55);
-
-    x = _mm_loadl_epi64((const __m128i *)(coef+8));
-    c8  = _mm_shuffle_epi32(x, 0x00);
-    c10  = _mm_shuffle_epi32(x, 0x55);
-
-    x = _mm_loadl_epi64((const __m128i *)(coef+12));
-    c12  = _mm_shuffle_epi32(x, 0x00);
-    c14  = _mm_shuffle_epi32(x, 0x55);
-
-    x = _mm_loadl_epi64((const __m128i *)(coef+16));
-    c16  = _mm_shuffle_epi32(x, 0x00);
-    c18  = _mm_shuffle_epi32(x, 0x55);
-
-    x = _mm_loadl_epi64((const __m128i *)(coef+20));
-    c20  = _mm_shuffle_epi32(x, 0x00);
-    c22  = _mm_shuffle_epi32(x, 0x55);
-
-    x = _mm_loadl_epi64((const __m128i *)(coef+24));
-    c24  = _mm_shuffle_epi32(x, 0x00);
-
-    for (i = 0; i < count; ++i) {
-
-        p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128());
-        p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
-        p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
-        p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
-        p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128());
-        p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128());
-        p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128());
-        p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128());
-
-        p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
-        p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
-        p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
-        p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
-        p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128());
-        p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128());
-        p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128());
-        p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128());
-
-        p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
-        p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
-        p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
-        p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
-        p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128());
-        p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128());
-        p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128());
-        p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128());
-
-        p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128());
-        p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128());
-        p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128());
-        p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128());
-        p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128());
-        p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128());
-        p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128());
-        p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128());
-
-        p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128());
-        p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128());
-        p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128());
-        p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128());
-        p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128());
-        p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128());
-        p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128());
-        p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128());
-
-        o0 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p0, p1),  c0);
-        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3),  c2));
-        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p8),  c4));
-        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p9,p10),  c6));
-        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12),  c8));
-        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p16, p17), c10));
-        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c12));
-        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p24), c14));
-        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p25,p26), c16));
-        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c18));
-        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p32, p33), c20));
-        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c22));
-        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24));
-        o0 = _mm_srai_epi32(o0, 8);
-
-        o1 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p1, p2),  c0);
-        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4),  c2));
-        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p9),  c4));
-        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p10,p11),  c6));
-        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p12,p13),  c8));
-        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p17,p18), c10));
-        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p19,p20), c12));
-        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p21,p25), c14));
-        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p26, p27), c16));
-        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c18));
-        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p33, p34), c20));
-        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c22));
-        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24));
-        o1 = _mm_srai_epi32(o1, 8);
-
-        o2 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p2,p3),  c0);
-        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p5),  c2));
-        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p6, p10),  c4));
-        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12),  c6));
-        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p13, p14),  c8));
-        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c10));
-        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p21), c12));
-        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p22, p26), c14));
-        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c16));
-        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p29, p30), c18));
-        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c20));
-        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p36, p37), c22));
-        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24));
-        o2 = _mm_srai_epi32(o2, 8);
-
-        o3 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4),  c0);
-        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p6),  c2));
-        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p7, p11),  c4));
-        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13),  c6));
-        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p14, p15),  c8));
-        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c10));
-        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p22), c12));
-        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p23, p27), c14));
-        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c16));
-        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p30, p31), c18));
-        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c20));
-        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p37,p38), c22));
-        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24));
-        o3 = _mm_srai_epi32(o3, 8);
-
-        o0 = packus_epi32(o0, o1);
-        o2 = packus_epi32(o2, o3);
-        o0 = _mm_packus_epi16(o0, o2);
-        _mm_storeu_si128((__m128i *)dst, o0);
-
-        y0 = (const char *)y0 + 16;
-        y1 = (const char *)y1 + 16;
-        y2 = (const char *)y2 + 16;
-        y3 = (const char *)y3 + 16;
-        y4 = (const char *)y4 + 16;
-        dst = (char *)dst + 16;
-    }
-}
-
-void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
-    __m128i all1s, ina, ins;
-    __m128i in0, in1, out0, out1;
-    __m128i t0, t1, t2, t3;
-    uint32_t i;
-
-    all1s = _mm_set1_epi16(255);
-
-    for (i = 0; i < count8; ++i) {
-        in0 = _mm_loadu_si128((const __m128i *)src);
-        in1 = _mm_loadu_si128((const __m128i *)src + 1);
-        out0 = _mm_loadu_si128((const __m128i *)dst);
-        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
-
-        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
-        ina = _mm_shufflelo_epi16(ins, 0xFF);
-        ina = _mm_shufflehi_epi16(ina, 0xFF);
-        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
-        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
-        t0 = _mm_srli_epi16(t0, 8);
-        t0 = _mm_add_epi16(t0, ins);
-
-        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
-        ina = _mm_shufflelo_epi16(ins, 0xFF);
-        ina = _mm_shufflehi_epi16(ina, 0xFF);
-        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
-        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
-        t1 = _mm_srli_epi16(t1, 8);
-        t1 = _mm_add_epi16(t1, ins);
-
-        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
-        ina = _mm_shufflelo_epi16(ins, 0xFF);
-        ina = _mm_shufflehi_epi16(ina, 0xFF);
-        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
-        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
-        t2 = _mm_srli_epi16(t2, 8);
-        t2 = _mm_add_epi16(t2, ins);
-
-        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
-        ina = _mm_shufflelo_epi16(ins, 0xFF);
-        ina = _mm_shufflehi_epi16(ina, 0xFF);
-        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
-        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
-        t3 = _mm_srli_epi16(t3, 8);
-        t3 = _mm_add_epi16(t3, ins);
-
-        t0 = _mm_packus_epi16(t0, t1);
-        t2 = _mm_packus_epi16(t2, t3);
-        _mm_storeu_si128((__m128i *)dst, t0);
-        _mm_storeu_si128((__m128i *)dst + 1, t2);
-
-        src = (const __m128i *)src + 2;
-        dst = (__m128i *)dst + 2;
-    }
-}
-
-void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
-    __m128i all1s, outa, outs;
-    __m128i in0, in1, out0, out1;
-    __m128i t0, t1, t2, t3;
-    uint32_t i;
-
-    all1s = _mm_set1_epi16(255);
-
-    for (i = 0; i < count8; ++i) {
-        in0 = _mm_loadu_si128((const __m128i *)src);
-        in1 = _mm_loadu_si128((const __m128i *)src + 1);
-        out0 = _mm_loadu_si128((const __m128i *)dst);
-        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
-
-
-        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
-        outa = _mm_shufflelo_epi16(outs, 0xFF);
-        outa = _mm_shufflehi_epi16(outa, 0xFF);
-        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
-        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
-        t0 = _mm_srli_epi16(t0, 8);
-        t0 = _mm_add_epi16(t0, outs);
-
-        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
-        outa = _mm_shufflelo_epi16(outs, 0xFF);
-        outa = _mm_shufflehi_epi16(outa, 0xFF);
-        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
-        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
-        t1 = _mm_srli_epi16(t1, 8);
-        t1 = _mm_add_epi16(t1, outs);
-
-        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
-        outa = _mm_shufflelo_epi16(outs, 0xFF);
-        outa = _mm_shufflehi_epi16(outa, 0xFF);
-        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
-        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
-        t2 = _mm_srli_epi16(t2, 8);
-        t2 = _mm_add_epi16(t2, outs);
-
-        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
-        outa = _mm_shufflelo_epi16(outs, 0xFF);
-        outa = _mm_shufflehi_epi16(outa, 0xFF);
-        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
-        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
-        t3 = _mm_srli_epi16(t3, 8);
-        t3 = _mm_add_epi16(t3, outs);
-
-        t0 = _mm_packus_epi16(t0, t1);
-        t2 = _mm_packus_epi16(t2, t3);
-        _mm_storeu_si128((__m128i *)dst, t0);
-        _mm_storeu_si128((__m128i *)dst + 1, t2);
-
-        src = (const __m128i *)src + 2;
-        dst = (__m128i *)dst + 2;
-    }
-}
-
-void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
-    __m128i outa;
-    __m128i in0, in1, out0, out1;
-    __m128i t0, t1, t2, t3;
-    uint32_t i;
-
-    for (i = 0; i < count8; ++i) {
-        in0 = _mm_loadu_si128((const __m128i *)src);
-        in1 = _mm_loadu_si128((const __m128i *)src + 1);
-        out0 = _mm_loadu_si128((const __m128i *)dst);
-        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
-
-        outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
-        outa = _mm_shufflelo_epi16(outa, 0xFF);
-        outa = _mm_shufflehi_epi16(outa, 0xFF);
-        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
-        t0 = _mm_mullo_epi16(t0, outa);
-        t0 = _mm_srli_epi16(t0, 8);
-
-        outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
-        outa = _mm_shufflelo_epi16(outa, 0xFF);
-        outa = _mm_shufflehi_epi16(outa, 0xFF);
-        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
-        t1 = _mm_mullo_epi16(t1, outa);
-        t1 = _mm_srli_epi16(t1, 8);
-
-        outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
-        outa = _mm_shufflelo_epi16(outa, 0xFF);
-        outa = _mm_shufflehi_epi16(outa, 0xFF);
-        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
-        t2 = _mm_mullo_epi16(t2, outa);
-        t2 = _mm_srli_epi16(t2, 8);
-
-        outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
-        outa = _mm_shufflelo_epi16(outa, 0xFF);
-        outa = _mm_shufflehi_epi16(outa, 0xFF);
-        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
-        t3 = _mm_mullo_epi16(t3, outa);
-        t3 = _mm_srli_epi16(t3, 8);
-
-        t0 = _mm_packus_epi16(t0, t1);
-        t2 = _mm_packus_epi16(t2, t3);
-        _mm_storeu_si128((__m128i *)dst, t0);
-        _mm_storeu_si128((__m128i *)dst + 1, t2);
-
-        src = (const __m128i *)src + 2;
-        dst = (__m128i *)dst + 2;
-    }
-}
-
-void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
-    __m128i ina;
-    __m128i in0, in1, out0, out1;
-    __m128i t0, t1, t2, t3;
-    uint32_t i;
-
-    for (i = 0; i < count8; ++i) {
-        in0 = _mm_loadu_si128((const __m128i *)src);
-        in1 = _mm_loadu_si128((const __m128i *)src + 1);
-        out0 = _mm_loadu_si128((const __m128i *)dst);
-        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
-
-        ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
-        ina = _mm_shufflelo_epi16(ina, 0xFF);
-        ina = _mm_shufflehi_epi16(ina, 0xFF);
-        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
-        t0 = _mm_mullo_epi16(t0, ina);
-        t0 = _mm_srli_epi16(t0, 8);
-
-        ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
-        ina = _mm_shufflelo_epi16(ina, 0xFF);
-        ina = _mm_shufflehi_epi16(ina, 0xFF);
-        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
-        t1 = _mm_mullo_epi16(t1, ina);
-        t1 = _mm_srli_epi16(t1, 8);
-
-        ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
-        ina = _mm_shufflelo_epi16(ina, 0xFF);
-        ina = _mm_shufflehi_epi16(ina, 0xFF);
-        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
-        t2 = _mm_mullo_epi16(t2, ina);
-        t2 = _mm_srli_epi16(t2, 8);
-
-        ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
-        ina = _mm_shufflelo_epi16(ina, 0xFF);
-        ina = _mm_shufflehi_epi16(ina, 0xFF);
-        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
-        t3 = _mm_mullo_epi16(t3, ina);
-        t3 = _mm_srli_epi16(t3, 8);
-
-        t0 = _mm_packus_epi16(t0, t1);
-        t2 = _mm_packus_epi16(t2, t3);
-        _mm_storeu_si128((__m128i *)dst, t0);
-        _mm_storeu_si128((__m128i *)dst + 1, t2);
-
-        src = (const __m128i *)src + 2;
-        dst = (__m128i *)dst + 2;
-    }
-}
-
-void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
-    __m128i all1s, outa;
-    __m128i in0, in1, out0, out1;
-    __m128i t0, t1, t2, t3;
-    uint32_t i;
-
-    all1s = _mm_set1_epi16(255);
-
-    for (i = 0; i < count8; ++i) {
-        in0 = _mm_loadu_si128((const __m128i *)src);
-        in1 = _mm_loadu_si128((const __m128i *)src + 1);
-        out0 = _mm_loadu_si128((const __m128i *)dst);
-        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
-
-        outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
-        outa = _mm_shufflelo_epi16(outa, 0xFF);
-        outa = _mm_shufflehi_epi16(outa, 0xFF);
-        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
-        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
-        t0 = _mm_srli_epi16(t0, 8);
-
-        outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
-        outa = _mm_shufflelo_epi16(outa, 0xFF);
-        outa = _mm_shufflehi_epi16(outa, 0xFF);
-        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
-        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
-        t1 = _mm_srli_epi16(t1, 8);
-
-        outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
-        outa = _mm_shufflelo_epi16(outa, 0xFF);
-        outa = _mm_shufflehi_epi16(outa, 0xFF);
-        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
-        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
-        t2 = _mm_srli_epi16(t2, 8);
-
-        outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
-        outa = _mm_shufflelo_epi16(outa, 0xFF);
-        outa = _mm_shufflehi_epi16(outa, 0xFF);
-        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
-        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
-        t3 = _mm_srli_epi16(t3, 8);
-
-        t0 = _mm_packus_epi16(t0, t1);
-        t2 = _mm_packus_epi16(t2, t3);
-        _mm_storeu_si128((__m128i *)dst, t0);
-        _mm_storeu_si128((__m128i *)dst + 1, t2);
-
-        src = (const __m128i *)src + 2;
-        dst = (__m128i *)dst + 2;
-    }
-}
-
-void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
-    __m128i all1s, ina;
-    __m128i in0, in1, out0, out1;
-    __m128i t0, t1, t2, t3;
-    uint32_t i;
-
-    all1s = _mm_set1_epi16(255);
-
-    for (i = 0; i < count8; ++i) {
-        in0 = _mm_loadu_si128((const __m128i *)src);
-        in1 = _mm_loadu_si128((const __m128i *)src + 1);
-        out0 = _mm_loadu_si128((const __m128i *)dst);
-        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
-
-        ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
-        ina = _mm_shufflelo_epi16(ina, 0xFF);
-        ina = _mm_shufflehi_epi16(ina, 0xFF);
-        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
-        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
-        t0 = _mm_srli_epi16(t0, 8);
-
-        ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
-        ina = _mm_shufflelo_epi16(ina, 0xFF);
-        ina = _mm_shufflehi_epi16(ina, 0xFF);
-        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
-        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
-        t1 = _mm_srli_epi16(t1, 8);
-
-        ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
-        ina = _mm_shufflelo_epi16(ina, 0xFF);
-        ina = _mm_shufflehi_epi16(ina, 0xFF);
-        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
-        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
-        t2 = _mm_srli_epi16(t2, 8);
-
-        ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
-        ina = _mm_shufflelo_epi16(ina, 0xFF);
-        ina = _mm_shufflehi_epi16(ina, 0xFF);
-        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
-        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
-        t3 = _mm_srli_epi16(t3, 8);
-
-        t0 = _mm_packus_epi16(t0, t1);
-        t2 = _mm_packus_epi16(t2, t3);
-        _mm_storeu_si128((__m128i *)dst, t0);
-        _mm_storeu_si128((__m128i *)dst + 1, t2);
-
-        src = (const __m128i *)src + 2;
-        dst = (__m128i *)dst + 2;
-    }
-}
-
-void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
-    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
-    __m128i all1s, ina, outa, ins, outs;
-    __m128i in0, in1, out0, out1;
-    __m128i t0, t1, t2, t3;
-    uint32_t i;
-
-    all1s = _mm_set1_epi16(255);
-
-    for (i = 0; i < count8; ++i) {
-        in0 = _mm_loadu_si128((const __m128i *)src);
-        in1 = _mm_loadu_si128((const __m128i *)src + 1);
-        out0 = _mm_loadu_si128((const __m128i *)dst);
-        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
-
-        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
-        ina = _mm_shufflelo_epi16(ins, 0xFF);
-        ina = _mm_shufflehi_epi16(ina, 0xFF);
-        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
-        outa = _mm_shufflelo_epi16(outs, 0xFF);
-        outa = _mm_shufflehi_epi16(outa, 0xFF);
-        t0 = _mm_sub_epi16(all1s, ina);
-        t0 = _mm_mullo_epi16(t0, outs);
-        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
-        t0 = _mm_srli_epi16(t0, 8);
-
-        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
-        ina = _mm_shufflelo_epi16(ins, 0xFF);
-        ina = _mm_shufflehi_epi16(ina, 0xFF);
-        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
-        outa = _mm_shufflelo_epi16(outs, 0xFF);
-        outa = _mm_shufflehi_epi16(outa, 0xFF);
-        t1 = _mm_sub_epi16(all1s, ina);
-        t1 = _mm_mullo_epi16(t1, outs);
-        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
-        t1 = _mm_srli_epi16(t1, 8);
-
-        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
-        ina = _mm_shufflelo_epi16(ins, 0xFF);
-        ina = _mm_shufflehi_epi16(ina, 0xFF);
-        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
-        outa = _mm_shufflelo_epi16(outs, 0xFF);
-        outa = _mm_shufflehi_epi16(outa, 0xFF);
-        t2 = _mm_sub_epi16(all1s, ina);
-        t2 = _mm_mullo_epi16(t2, outs);
-        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
-        t2 = _mm_srli_epi16(t2, 8);
-
-        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
-        ina = _mm_shufflelo_epi16(ins, 0xFF);
-        ina = _mm_shufflehi_epi16(ina, 0xFF);
-        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
-        outa = _mm_shufflelo_epi16(outs, 0xFF);
-        outa = _mm_shufflehi_epi16(outa, 0xFF);
-        t3 = _mm_sub_epi16(all1s, ina);
-        t3 = _mm_mullo_epi16(t3, outs);
-        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
-        t3 = _mm_srli_epi16(t3, 8);
-
-        t0 = _mm_packus_epi16(t0, t1);
-        t0 = blendv_epi8(t0, out0, M0001);
-        t2 = _mm_packus_epi16(t2, t3);
-        t2 = blendv_epi8(t2, out1, M0001);
-        _mm_storeu_si128((__m128i *)dst, t0);
-        _mm_storeu_si128((__m128i *)dst + 1, t2);
-
-        src = (const __m128i *)src + 2;
-        dst = (__m128i *)dst + 2;
-    }
-}
-
-void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
-    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
-    __m128i all1s, ina, ins, outa, outs;
-    __m128i in0, in1, out0, out1;
-    __m128i t0, t1, t2, t3;
-    uint32_t i;
-
-    all1s = _mm_set1_epi16(255);
-
-    for (i = 0; i < count8; ++i) {
-        in0 = _mm_loadu_si128((const __m128i *)src);
-        in1 = _mm_loadu_si128((const __m128i *)src + 1);
-        out0 = _mm_loadu_si128((const __m128i *)dst);
-        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
-
-        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
-        ina = _mm_shufflelo_epi16(ins, 0xFF);
-        ina = _mm_shufflehi_epi16(ina, 0xFF);
-        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
-        outa = _mm_shufflelo_epi16(outs, 0xFF);
-        outa = _mm_shufflehi_epi16(outa, 0xFF);
-        t0 = _mm_sub_epi16(all1s, outa);
-        t0 = _mm_mullo_epi16(t0, ins);
-        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs));
-        t0 = _mm_srli_epi16(t0, 8);
-
-        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
-        ina = _mm_shufflelo_epi16(ins, 0xFF);
-        ina = _mm_shufflehi_epi16(ina, 0xFF);
-        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
-        outa = _mm_shufflelo_epi16(outs, 0xFF);
-        outa = _mm_shufflehi_epi16(outa, 0xFF);
-        t1 = _mm_sub_epi16(all1s, outa);
-        t1 = _mm_mullo_epi16(t1, ins);
-        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs));
-        t1 = _mm_srli_epi16(t1, 8);
-
-        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
-        ina = _mm_shufflelo_epi16(ins, 0xFF);
-        ina = _mm_shufflehi_epi16(ina, 0xFF);
-        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
-        outa = _mm_shufflelo_epi16(outs, 0xFF);
-        outa = _mm_shufflehi_epi16(outa, 0xFF);
-        t2 = _mm_sub_epi16(all1s, outa);
-        t2 = _mm_mullo_epi16(t2, ins);
-        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs));
-        t2 = _mm_srli_epi16(t2, 8);
-
-        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
-        ina = _mm_shufflelo_epi16(ins, 0xFF);
-        ina = _mm_shufflehi_epi16(ina, 0xFF);
-        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
-        outa = _mm_shufflelo_epi16(outs, 0xFF);
-        outa = _mm_shufflehi_epi16(outa, 0xFF);
-        t3 = _mm_sub_epi16(all1s, outa);
-        t3 = _mm_mullo_epi16(t3, ins);
-        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs));
-        t3 = _mm_srli_epi16(t3, 8);
-
-        t0 = _mm_packus_epi16(t0, t1);
-        t0 = blendv_epi8(t0, in0, M0001);
-        t2 = _mm_packus_epi16(t2, t3);
-        t2 = blendv_epi8(t2, in1, M0001);
-        _mm_storeu_si128((__m128i *)dst, t0);
-        _mm_storeu_si128((__m128i *)dst + 1, t2);
-
-        src = (const __m128i *)src + 2;
-        dst = (__m128i *)dst + 2;
-    }
-}
-
-void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
-    __m128i in0, in1, out0, out1;
-    uint32_t i;
-
-    for (i = 0; i < count8; ++i) {
-        in0 = _mm_loadu_si128((const __m128i *)src);
-        in1 = _mm_loadu_si128((const __m128i *)src + 1);
-        out0 = _mm_loadu_si128((const __m128i *)dst);
-        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
-
-        out0 = _mm_xor_si128(out0, in0);
-        out1 = _mm_xor_si128(out1, in1);
-
-        _mm_storeu_si128((__m128i *)dst, out0);
-        _mm_storeu_si128((__m128i *)dst + 1, out1);
-
-        src = (const __m128i *)src + 2;
-        dst = (__m128i *)dst + 2;
-    }
-}
-
-void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
-    __m128i in0, in1, out0, out1;
-    __m128i t0, t1, t2, t3;
-    uint32_t i;
-
-    for (i = 0; i < count8; ++i) {
-        in0 = _mm_loadu_si128((const __m128i *)src);
-        in1 = _mm_loadu_si128((const __m128i *)src + 1);
-        out0 = _mm_loadu_si128((const __m128i *)dst);
-        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
-
-        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
-        t0 = _mm_mullo_epi16(t0, _mm_unpacklo_epi8(out0, _mm_setzero_si128()));
-        t0 = _mm_srli_epi16(t0, 8);
-
-        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
-        t1 = _mm_mullo_epi16(t1, _mm_unpackhi_epi8(out0, _mm_setzero_si128()));
-        t1 = _mm_srli_epi16(t1, 8);
-
-        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
-        t2 = _mm_mullo_epi16(t2, _mm_unpacklo_epi8(out1, _mm_setzero_si128()));
-        t2 = _mm_srli_epi16(t2, 8);
-
-        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
-        t3 = _mm_mullo_epi16(t3, _mm_unpackhi_epi8(out1, _mm_setzero_si128()));
-        t3 = _mm_srli_epi16(t3, 8);
-
-        t0 = _mm_packus_epi16(t0, t1);
-        t2 = _mm_packus_epi16(t2, t3);
-        _mm_storeu_si128((__m128i *)dst, t0);
-        _mm_storeu_si128((__m128i *)dst + 1, t2);
-
-        src = (const __m128i *)src + 2;
-        dst = (__m128i *)dst + 2;
-    }
-}
-
-void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
-    __m128i in0, in1, out0, out1;
-    uint32_t i;
-
-    for (i = 0; i < count8; ++i) {
-        in0 = _mm_loadu_si128((const __m128i *)src);
-        in1 = _mm_loadu_si128((const __m128i *)src + 1);
-        out0 = _mm_loadu_si128((const __m128i *)dst);
-        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
-
-        out0 = _mm_adds_epu8(out0, in0);
-        out1 = _mm_adds_epu8(out1, in1);
-
-        _mm_storeu_si128((__m128i *)dst, out0);
-        _mm_storeu_si128((__m128i *)dst + 1, out1);
-
-        src = (const __m128i *)src + 2;
-        dst = (__m128i *)dst + 2;
-    }
-}
-
-void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
-    __m128i in0, in1, out0, out1;
-    uint32_t i;
-
-    for (i = 0; i < count8; ++i) {
-        in0 = _mm_loadu_si128((const __m128i *)src);
-        in1 = _mm_loadu_si128((const __m128i *)src + 1);
-        out0 = _mm_loadu_si128((const __m128i *)dst);
-        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
-
-        out0 = _mm_subs_epu8(out0, in0);
-        out1 = _mm_subs_epu8(out1, in1);
-
-        _mm_storeu_si128((__m128i *)dst, out0);
-        _mm_storeu_si128((__m128i *)dst + 1, out1);
-
-        src = (const __m128i *)src + 2;
-        dst = (__m128i *)dst + 2;
-    }
-}
-
-}  // namespace android
-}  // namespace renderscript
commit	e23e8bda0c147c900c336366117b8d28a28a7fee	[log] [tgz]
author	Android Build Coastguard Worker <android-build-coastguard-worker@google.com>	Wed Jun 15 21:48:09 2022 +0000
committer	Android Build Coastguard Worker <android-build-coastguard-worker@google.com>	Wed Jun 15 21:48:09 2022 +0000
tree	c0726a51cec8ede53aa7796a05d2b75480ce0cd0
parent	9c7eeb2dc2c0ea2af889c3dccdb8e9c4b4a7aeb3 [diff]
parent	16f96a2cb9f5d2764be7d8fad4eac6ae7167cd92 [diff]