Snap for 11383711 from 51e5da82a7846a8052db319076ad245cca59adfb to mainline-cellbroadcast-release

Change-Id: I61f2f75f11a8acdb1036ef9cc328151f36d9b6ea
diff --git a/src/internal.h b/src/internal.h
index b1a94f8..631c5a8 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -142,7 +142,7 @@
         atomic_uint reset_task_cur;
         atomic_int cond_signaled;
         struct {
-            int exec;
+            int exec, finished;
             pthread_cond_t cond;
             const Dav1dPicture *in;
             Dav1dPicture *out;
diff --git a/src/lib.c b/src/lib.c
index 7b82309..3807efd 100644
--- a/src/lib.c
+++ b/src/lib.c
@@ -50,9 +50,6 @@
 #include "src/thread_task.h"
 #include "src/wedge.h"
 
-// TODO(b/315538557): Temporarily disable multi-threading in film grain
-#define DISABLE_FG_MT 1
-
 static COLD void init_internal(void) {
     dav1d_init_cpu();
     dav1d_init_ii_wedge_masks();
@@ -499,7 +496,7 @@
     int res = dav1d_picture_alloc_copy(c, out, in->p.w, in);
     if (res < 0) goto error;
 
-    if (c->n_tc > 1 && !DISABLE_FG_MT) {
+    if (c->n_tc > 1) {
         dav1d_task_delayed_fg(c, out, in);
     } else {
         switch (out->p.bpc) {
diff --git a/src/thread_task.c b/src/thread_task.c
index 1698ab0..1ededde 100644
--- a/src/thread_task.c
+++ b/src/thread_task.c
@@ -357,8 +357,11 @@
     atomic_init(&ttd->delayed_fg.progress[1], 0);
     pthread_mutex_lock(&ttd->lock);
     ttd->delayed_fg.exec = 1;
+    ttd->delayed_fg.finished = 0;
     pthread_cond_signal(&ttd->cond);
-    pthread_cond_wait(&ttd->delayed_fg.cond, &ttd->lock);
+    do {
+        pthread_cond_wait(&ttd->delayed_fg.cond, &ttd->lock);
+    } while (!ttd->delayed_fg.finished);
     pthread_mutex_unlock(&ttd->lock);
 }
 
@@ -501,45 +504,44 @@
         int row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1);
         pthread_mutex_unlock(&ttd->lock);
         int progmax = (out->p.h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE;
-    fg_apply_loop:
-        if (row + 1 < progmax)
-            pthread_cond_signal(&ttd->cond);
-        else if (row + 1 >= progmax) {
-            pthread_mutex_lock(&ttd->lock);
-            ttd->delayed_fg.exec = 0;
-            if (row >= progmax) goto end_add;
-            pthread_mutex_unlock(&ttd->lock);
-        }
-        switch (out->p.bpc) {
+        while (row < progmax) {
+            if (row + 1 < progmax)
+                pthread_cond_signal(&ttd->cond);
+            else {
+                pthread_mutex_lock(&ttd->lock);
+                ttd->delayed_fg.exec = 0;
+                pthread_mutex_unlock(&ttd->lock);
+            }
+            switch (out->p.bpc) {
 #if CONFIG_8BPC
-        case 8:
-            dav1d_apply_grain_row_8bpc(&c->dsp[0].fg, out, in,
-                                       ttd->delayed_fg.scaling_8bpc,
-                                       ttd->delayed_fg.grain_lut_8bpc, row);
-            break;
+            case 8:
+                dav1d_apply_grain_row_8bpc(&c->dsp[0].fg, out, in,
+                                           ttd->delayed_fg.scaling_8bpc,
+                                           ttd->delayed_fg.grain_lut_8bpc, row);
+                break;
 #endif
 #if CONFIG_16BPC
-        case 10:
-        case 12:
-            dav1d_apply_grain_row_16bpc(&c->dsp[off].fg, out, in,
-                                        ttd->delayed_fg.scaling_16bpc,
-                                        ttd->delayed_fg.grain_lut_16bpc, row);
-            break;
+            case 10:
+            case 12:
+                dav1d_apply_grain_row_16bpc(&c->dsp[off].fg, out, in,
+                                            ttd->delayed_fg.scaling_16bpc,
+                                            ttd->delayed_fg.grain_lut_16bpc, row);
+                break;
 #endif
-        default: abort();
+            default: abort();
+            }
+            row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1);
+            atomic_fetch_add(&ttd->delayed_fg.progress[1], 1);
         }
-        row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1);
-        int done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1;
-        if (row < progmax) goto fg_apply_loop;
         pthread_mutex_lock(&ttd->lock);
         ttd->delayed_fg.exec = 0;
-    end_add:
-        done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1;
+        int done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1;
         progmax = atomic_load(&ttd->delayed_fg.progress[0]);
         // signal for completion only once the last runner reaches this
-        if (done < progmax)
-            break;
-        pthread_cond_signal(&ttd->delayed_fg.cond);
+        if (done >= progmax) {
+            ttd->delayed_fg.finished = 1;
+            pthread_cond_signal(&ttd->delayed_fg.cond);
+        }
         break;
     default: abort();
     }