/*
 * This file is part of libplacebo.
 *
 * libplacebo is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * libplacebo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
 */

#include <math.h>
#include "shaders.h"

#include <libplacebo/colorspace.h>
#include <libplacebo/shaders/sampling.h>

// Default debanding parameters, initialized from the PL_DEBAND_DEFAULTS macro.
// pl_shader_deband() falls back to these when called with `params == NULL`.
const struct pl_deband_params pl_deband_default_params = { PL_DEBAND_DEFAULTS };

static inline struct pl_tex_params src_params(const struct pl_sample_src *src)
{
    if (src->tex)
        return src->tex->params;

    return (struct pl_tex_params) {
        .w = src->tex_w,
        .h = src->tex_h,
    };
}

// Filtering requirement a sampling shader passes to setup_src().
enum filter {
    // These two share their values with `enum pl_tex_sample_mode`, so
    // setup_src() can cast them directly when checking an external sampler.
    NEAREST = PL_TEX_SAMPLE_NEAREST,
    LINEAR  = PL_TEX_SAMPLE_LINEAR,
    // Linear if the texture format supports it, nearest otherwise. For
    // external samplers, whatever mode the sampler already uses.
    BEST,
    // Nearest for textures. For external samplers, whatever mode the sampler
    // already uses.
    FASTEST,
};

// Helper function to compute the src/dst sizes and upscaling ratios
//
// Also picks the texture sample mode implied by `filter`, registers the
// shader signature / output size via sh_require(), and either binds
// `src->tex` or sets up identifiers for an externally provided sampler.
//
// Outputs:
//   src_tex, pos     - always written on success
//   pt               - passed through to sh_bind() for textures; for external
//                      samplers only written when non-NULL
//   ratio_x, ratio_y - optional; output size divided by |source size|
//   comp_mask        - optional; components both requested and available
//   scale            - optional; `src->scale`, defaulting to 1.0
//
// If `resizeable` is true, an output size of 0x0 is passed to sh_require()
// instead of the computed one.
//
// Returns false (after SH_FAIL where applicable) if the requested filter mode
// cannot be satisfied, or if sh_require() fails.
static bool setup_src(pl_shader sh, const struct pl_sample_src *src,
                      ident_t *src_tex, ident_t *pos, ident_t *pt,
                      float *ratio_x, float *ratio_y, uint8_t *comp_mask,
                      float *scale, bool resizeable,
                      enum filter filter)
{
    enum pl_shader_sig sig;
    float src_w, src_h;
    enum pl_tex_sample_mode sample_mode;
    if (src->tex) {
        // Texture path: the source size comes from the sampled rect, and the
        // sample mode is derived from the format's capabilities
        pl_fmt fmt = src->tex->params.format;
        bool can_linear = fmt->caps & PL_FMT_CAP_LINEAR;
        pl_assert(pl_tex_params_dimension(src->tex->params) == 2);
        sig = PL_SHADER_SIG_NONE;
        src_w = pl_rect_w(src->rect);
        src_h = pl_rect_h(src->rect);
        switch (filter) {
        case FASTEST:
        case NEAREST:
            sample_mode = PL_TEX_SAMPLE_NEAREST;
            break;
        case LINEAR:
            if (!can_linear) {
                SH_FAIL(sh, "Trying to use a shader that requires linear "
                        "sampling with a texture whose format (%s) does not "
                        "support PL_FMT_CAP_LINEAR", fmt->name);
                return false;
            }
            sample_mode = PL_TEX_SAMPLE_LINEAR;
            break;
        case BEST:
            sample_mode = can_linear ? PL_TEX_SAMPLE_LINEAR : PL_TEX_SAMPLE_NEAREST;
            break;
        }
    } else {
        // External sampler path: the sample mode is fixed by the caller
        // (`src->mode`), so a specific filter request must match it exactly
        pl_assert(src->tex_w && src->tex_h);
        sig = PL_SHADER_SIG_SAMPLER;
        src_w = src->sampled_w;
        src_h = src->sampled_h;
        if (filter == BEST || filter == FASTEST) {
            sample_mode = src->mode;
        } else {
            sample_mode = (enum pl_tex_sample_mode) filter;
            if (sample_mode != src->mode) {
                SH_FAIL(sh, "Trying to use a shader that requires a different "
                        "filter mode than the external sampler.");
                return false;
            }
        }
    }

    // A zero source size falls back to the full texture dimensions
    src_w = PL_DEF(src_w, src_params(src).w);
    src_h = PL_DEF(src_h, src_params(src).h);
    pl_assert(src_w && src_h);

    // The source size may be negative (see the fabs() calls); the output size
    // defaults to its rounded magnitude unless new_w / new_h are set
    int out_w = PL_DEF(src->new_w, roundf(fabs(src_w)));
    int out_h = PL_DEF(src->new_h, roundf(fabs(src_h)));
    pl_assert(out_w && out_h);

    if (ratio_x)
        *ratio_x = out_w / fabs(src_w);
    if (ratio_y)
        *ratio_y = out_h / fabs(src_h);
    if (scale)
        *scale = PL_DEF(src->scale, 1.0);

    if (comp_mask) {
        uint8_t tex_mask = 0x0Fu;
        if (src->tex) {
            // Mask containing only the number of components in the texture
            tex_mask = (1 << src->tex->params.format->num_components) - 1;
        }

        // An explicit component mask takes priority; otherwise use the first
        // `src->components` components (all four if that is unset as well)
        uint8_t src_mask = src->component_mask;
        if (!src_mask)
            src_mask = (1 << PL_DEF(src->components, 4)) - 1;

        // Only actually sample components that are both requested and
        // available in the texture being sampled
        *comp_mask = tex_mask & src_mask;
    }

    // Note: the ratios above were computed from the real output size, before
    // it is zeroed out here for resizeable shaders
    if (resizeable)
        out_w = out_h = 0;
    if (!sh_require(sh, sig, out_w, out_h))
        return false;

    if (src->tex) {
        pl_rect2df rect = {
            .x0 = src->rect.x0,
            .y0 = src->rect.y0,
            .x1 = src->rect.x0 + src_w,
            .y1 = src->rect.y0 + src_h,
        };

        *src_tex = sh_bind(sh, src->tex, src->address_mode, sample_mode,
                           "src_tex", &rect, pos, pt);
    } else {
        if (pt) {
            // Size of one texel in sampling coordinates; PL_SAMPLER_RECT
            // uses a step of 1.0 rather than 1/size
            float sx = 1.0 / src->tex_w, sy = 1.0 / src->tex_h;
            if (src->sampler == PL_SAMPLER_RECT)
                sx = sy = 1.0;

            *pt = sh_var(sh, (struct pl_shader_var) {
                .var = pl_var_vec2("tex_pt"),
                .data = &(float[2]) { sx, sy },
            });
        }

        sh->sampler_type = src->sampler;

        // Pick the sampler type prefix matching the sampled format type
        pl_assert(src->format);
        switch (src->format) {
        case PL_FMT_UNKNOWN:
        case PL_FMT_FLOAT:
        case PL_FMT_UNORM:
        case PL_FMT_SNORM: sh->sampler_prefix = ' '; break;
        case PL_FMT_UINT: sh->sampler_prefix = 'u'; break;
        case PL_FMT_SINT: sh->sampler_prefix = 's'; break;
        case PL_FMT_TYPE_COUNT:
            pl_unreachable();
        }

        *src_tex = sh_fresh(sh, "src_tex");
        *pos     = sh_fresh(sh, "pos");

        // Alias the fresh identifiers to the plain names `src_tex` and `pos`
        // in the shader header
        GLSLH("#define "$" src_tex  \n"
              "#define "$" pos      \n",
              *src_tex, *pos);
    }

    return true;
}

// Emits a debanding shader that samples `src` and leaves the result in the
// GLSL variable `color`.
//
// `params` may be NULL, in which case pl_deband_default_params is used. The
// source is set up with NEAREST filtering and a fixed (non-resizeable) output
// size. The alpha channel is never debanded; if no other components are
// selected, the shader only samples and applies `scale`. On setup failure
// this returns without emitting anything.
void pl_shader_deband(pl_shader sh, const struct pl_sample_src *src,
                      const struct pl_deband_params *params)
{
    float scale;
    ident_t tex, pos, pt;
    uint8_t mask;
    if (!setup_src(sh, src, &tex, &pos, &pt, NULL, NULL, &mask, &scale, false, NEAREST))
        return;

    params = PL_DEF(params, &pl_deband_default_params);
    sh_describe(sh, "debanding");
    GLSL("vec4 color;                       \n"
         "// pl_shader_deband               \n"
         "{                                 \n"
         "vec2 pos = "$", pt = "$";         \n"
         "color = textureLod("$", pos, 0.0);\n",
         pos, pt, tex);

    mask &= ~0x8u; // ignore alpha channel
    uint8_t num_comps = sh_num_comps(mask);
    const char *swiz = sh_swizzle(mask);
    pl_assert(num_comps <= 3);
    if (!num_comps) {
        // Nothing to deband: just apply the scale and close the block
        GLSL("color *= "$"; \n"
             "}             \n",
             SH_FLOAT(scale));
        return;
    }

    // GET(X, Y) samples the selected components at an offset of (X, Y)
    // texels; T is the GLSL float type matching the component mask
    GLSL("#define GET(X, Y)                                   \\\n"
         "    (textureLod("$", pos + pt * vec2(X, Y), 0.0).%s)  \n"
         "#define T %s                                          \n",
         tex, swiz, sh_float_type(mask));

    ident_t prng = sh_prng(sh, true, NULL);
    GLSL("T avg, diff, bound;   \n"
         "T res = color.%s;     \n"
         "vec2 d;               \n",
         swiz);

    if (params->iterations > 0) {
        ident_t radius = sh_const_float(sh, "radius", params->radius);
        // The threshold is divided by `scale` because the comparison happens
        // on the sampled values, before `scale` is applied at the end
        ident_t threshold = sh_const_float(sh, "threshold",
                                           params->threshold / (1000 * scale));

        // For each iteration, compute the average at a given distance and
        // pick it instead of the color if the difference is below the threshold.
        // The distance grows and the bound shrinks linearly with `i`.
        for (int i = 1; i <= params->iterations; i++) {
            GLSL(// Compute a random angle and distance
                 "d = "$".xy * vec2(%d.0 * "$", %f);    \n"
                 "d = d.x * vec2(cos(d.y), sin(d.y));   \n"
                 // Sample at quarter-turn intervals around the source pixel
                 "avg = T(0.0);                         \n"
                 "avg += GET(+d.x, +d.y);               \n"
                 "avg += GET(-d.x, +d.y);               \n"
                 "avg += GET(-d.x, -d.y);               \n"
                 "avg += GET(+d.x, -d.y);               \n"
                 "avg *= 0.25;                          \n"
                 // Compare the (normalized) average against the pixel
                 "diff = abs(res - avg);                \n"
                 "bound = T("$" / %d.0);                \n",
                 prng, i, radius, M_PI * 2,
                 threshold, i);

            // mix() needs a bvec selector for vector types and a plain bool
            // for scalars
            if (num_comps > 1) {
                GLSL("res = mix(avg, res, greaterThan(diff, bound)); \n");
            } else {
                GLSL("res = mix(avg, res, diff > bound); \n");
            }
        }
    }

    // Add some random noise to smooth out residual differences
    if (params->grain > 0) {
        // Avoid adding grain near true black
        // (`bound` is rebuilt from the first `num_comps` entries of
        // grain_neutral, and the grain strength is capped by the distance
        // to it)
        GLSL("bound = T(\n");
        for (int c = 0; c < num_comps; c++) {
            GLSL("%c"$, c > 0 ? ',' : ' ',
                 SH_FLOAT(params->grain_neutral[c] / scale));
        }
        GLSL(");                                        \n"
             "T strength = min(abs(res - bound), "$");  \n"
             "res += strength * (T("$") - T(0.5));      \n",
             SH_FLOAT(params->grain / (1000.0 * scale)), prng);
    }

    // Write back the debanded components, apply the scale and clean up the
    // helper macros
    GLSL("color.%s = res;   \n"
         "color *= "$";     \n"
         "#undef T          \n"
         "#undef GET        \n"
         "}                 \n",
         swiz, SH_FLOAT(scale));
}

// Samples `src` with a single texture lookup, using the best sample mode
// available (see the BEST filter), and stores the scaled result in `color`.
// The shader output size is left resizeable. Returns false if the source
// could not be set up.
bool pl_shader_sample_direct(pl_shader sh, const struct pl_sample_src *src)
{
    ident_t src_tex, coord;
    float scale;

    const bool ok = setup_src(sh, src, &src_tex, &coord, NULL, NULL, NULL,
                              NULL, &scale, true, BEST);
    if (!ok)
        return false;

    GLSL("// pl_shader_sample_direct                            \n"
         "vec4 color = vec4("$") * textureLod("$", "$", 0.0);   \n",
         SH_FLOAT(scale), src_tex, coord);

    return true;
}

// Samples `src` with a single nearest-neighbour texture lookup and stores the
// scaled result in `color`. The shader output size is left resizeable.
// Returns false if the source could not be set up with NEAREST filtering.
bool pl_shader_sample_nearest(pl_shader sh, const struct pl_sample_src *src)
{
    ident_t src_tex, coord;
    float scale;

    const bool ok = setup_src(sh, src, &src_tex, &coord, NULL, NULL, NULL,
                              NULL, &scale, true, NEAREST);
    if (!ok)
        return false;

    sh_describe(sh, "nearest");
    GLSL("// pl_shader_sample_nearest                           \n"
         "vec4 color = vec4("$") * textureLod("$", "$", 0.0);   \n",
         SH_FLOAT(scale), src_tex, coord);

    return true;
}

// Samples `src` with a single linearly filtered texture lookup and stores the
// scaled result in `color`. The shader output size is left resizeable.
// Returns false if the source could not be set up with LINEAR filtering.
bool pl_shader_sample_bilinear(pl_shader sh, const struct pl_sample_src *src)
{
    ident_t src_tex, coord;
    float scale;

    const bool ok = setup_src(sh, src, &src_tex, &coord, NULL, NULL, NULL,
                              NULL, &scale, true, LINEAR);
    if (!ok)
        return false;

    sh_describe(sh, "bilinear");
    GLSL("// pl_shader_sample_bilinear                          \n"
         "vec4 color = vec4("$") * textureLod("$", "$", 0.0);   \n",
         SH_FLOAT(scale), src_tex, coord);

    return true;
}

// Emits the "fast" bicubic sampling shader for `src`.
//
// Requires LINEAR filtering on the source (setup fails otherwise) and leaves
// the output size resizeable. A trace message is logged when either axis is
// being downscaled. Returns false if the source could not be set up.
bool pl_shader_sample_bicubic(pl_shader sh, const struct pl_sample_src *src)
{
    ident_t tex, pos, pt;
    float rx, ry, scale;
    if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
        return false;

    if (rx < 1 || ry < 1) {
        PL_TRACE(sh, "Using fast bicubic sampling when downscaling. This "
                 "will most likely result in nasty aliasing!");
    }

    // Explanation of how bicubic scaling with only 4 texel fetches is done:
    //   http://www.mate.tue.nl/mate/pdfs/10318.pdf
    //   'Efficient GPU-Based Texture Interpolation using Uniform B-Splines'

    sh_describe(sh, "bicubic");
    // Generated code: packs the identifiers referenced by the GLSL template
    // and queues the external template function on the shader body buffer
{
    const struct __attribute__((__packed__)) {
    ident_t pos;
    ident_t tex;
    ident_t pt;
    ident_t scale;
} _glsl_331_args = {
#line 334
        .pos = pos,
#line 335
        .tex = tex,
#line 349
        .pt = pt,
#line 356
        .scale = sh_const_float(sh, "scale", scale),
    };
#line 331
    size_t _glsl_331_fn(void *, pl_str *, const uint8_t *);
    pl_str_builder_append(sh->buffers[SH_BUF_BODY], _glsl_331_fn,
                          &_glsl_331_args, sizeof(_glsl_331_args));
}
#line 358

    return true;
}

// Emits the "fast" hermite sampling shader for `src`.
//
// Requires LINEAR filtering on the source (setup fails otherwise) and leaves
// the output size resizeable. A trace message is logged when either axis is
// being downscaled. Returns false if the source could not be set up.
bool pl_shader_sample_hermite(pl_shader sh, const struct pl_sample_src *src)
{
    ident_t tex, pos, pt;
    float rx, ry, scale;
    if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
        return false;

    if (rx < 1 || ry < 1) {
        PL_TRACE(sh, "Using fast hermite sampling when downscaling. This "
                 "will most likely result in nasty aliasing!");
    }

    sh_describe(sh, "hermite");
    // Generated code: packs the identifiers referenced by the GLSL template
    // and queues the external template function on the shader body buffer
{
    const struct __attribute__((__packed__)) {
    ident_t pos;
    ident_t tex;
    ident_t pt;
    ident_t scale;
} _glsl_375_args = {
#line 378
        .pos = pos,
#line 379
        .tex = tex,
#line 381
        .pt = pt,
#line 382
        .scale = sh_const_float(sh, "scale", scale),
    };
#line 375
    size_t _glsl_375_fn(void *, pl_str *, const uint8_t *);
    pl_str_builder_append(sh->buffers[SH_BUF_BODY], _glsl_375_fn,
                          &_glsl_375_args, sizeof(_glsl_375_args));
}
#line 384

    return true;
}

// Emits the "fast" gaussian sampling shader for `src`.
//
// Requires LINEAR filtering on the source (setup fails otherwise) and leaves
// the output size resizeable. A trace message is logged when either axis is
// being downscaled. Returns false if the source could not be set up.
bool pl_shader_sample_gaussian(pl_shader sh, const struct pl_sample_src *src)
{
    ident_t tex, pos, pt;
    float rx, ry, scale;
    if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
        return false;

    if (rx < 1 || ry < 1) {
        PL_TRACE(sh, "Using fast gaussian sampling when downscaling. This "
                 "will most likely result in nasty aliasing!");
    }

    sh_describe(sh, "gaussian");
    // Generated code: packs the identifiers referenced by the GLSL template
    // and queues the external template function on the shader body buffer
{
    const struct __attribute__((__packed__)) {
    ident_t pos;
    ident_t tex;
    ident_t pt;
    ident_t scale;
} _glsl_401_args = {
#line 404
        .pos = pos,
#line 405
        .tex = tex,
#line 419
        .pt = pt,
#line 426
        .scale = sh_const_float(sh, "scale", scale),
    };
#line 401
    size_t _glsl_401_fn(void *, pl_str *, const uint8_t *);
    pl_str_builder_append(sh->buffers[SH_BUF_BODY], _glsl_401_fn,
                          &_glsl_401_args, sizeof(_glsl_401_args));
}
#line 428

    return true;
}

// Emits the oversampling shader for `src`.
//
// Requires LINEAR filtering on the source (setup fails otherwise) and leaves
// the output size resizeable. `threshold` is clamped to [0, 0.5] before use;
// the generated template additionally receives a flag telling it whether the
// clamped threshold is greater than zero. Returns false if the source could
// not be set up.
bool pl_shader_sample_oversample(pl_shader sh, const struct pl_sample_src *src,
                                 float threshold)
{
    ident_t tex, pos, pt;
    float rx, ry, scale;
    if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR))
        return false;

    threshold = PL_CLAMP(threshold, 0.0f, 0.5f);
    sh_describe(sh, "oversample");
    // Generated code: packs the identifiers and flags referenced by the GLSL
    // template and queues the external template function on the shader body
    // buffer. The scaling ratios are passed as shader variables, while the
    // threshold and scale are passed as shader constants.
{
    const struct __attribute__((__packed__)) {
    ident_t pos;
    ident_t tex;
    ident_t rx;
    ident_t ry;
    ident_t threshold;
    ident_t pt;
    ident_t scale;
    bool threshold_0;
} _glsl_442_args = {
#line 445
        .pos = pos,
#line 446
        .tex = tex,
#line 449
        .rx = sh_var_float(sh, "rx", rx, true),
#line 450
        .ry = sh_var_float(sh, "ry", ry, true),
#line 454
        .threshold = sh_const_float(sh, "threshold", threshold),
#line 462
        .pt = pt,
#line 463
        .scale = sh_const_float(sh, "scale", scale),
#line 453
        .threshold_0 = threshold > 0,
    };
#line 442
    size_t _glsl_442_fn(void *, pl_str *, const uint8_t *);
    pl_str_builder_append(sh->buffers[SH_BUF_BODY], _glsl_442_fn,
                          &_glsl_442_args, sizeof(_glsl_442_args));
}
#line 465

    return true;
}

// Attaches a human-readable description of the scaling operation to `sh`,
// of the form "<stage> <dir>scaling (<filter>)".
//
// <dir> is "up" / "down" when both ratios are above / below 1, "noop" when
// both are exactly 1 and "ana" for everything else. <filter> is the config's
// own name if it has one; otherwise the kernel name, followed by "+<window>"
// if a window is configured. Missing names are printed as "unknown".
static void describe_filter(pl_shader sh, const struct pl_filter_config *cfg,
                            const char *stage, float rx, float ry)
{
    const char *dir = "ana";
    if (rx > 1 && ry > 1)
        dir = "up";
    else if (rx < 1 && ry < 1)
        dir = "down";
    else if (rx == 1 && ry == 1)
        dir = "noop";

    // A named config takes priority over the kernel/window names
    if (cfg->name) {
        sh_describef(sh, "%s %sscaling (%s)", stage, dir, cfg->name);
        return;
    }

    if (!cfg->window) {
        sh_describef(sh, "%s %sscaling (%s)", stage, dir,
                     PL_DEF(cfg->kernel->name, "unknown"));
        return;
    }

    sh_describef(sh, "%s %sscaling (%s+%s)", stage, dir,
                 PL_DEF(cfg->kernel->name, "unknown"),
                 PL_DEF(cfg->window->name, "unknown"));
}

// Subroutine for computing and adding an individual texel contribution
// If `in` is NULL, samples directly
// If `in` is set, takes the pixel from inX[idx] where X is the component,
// `in` is the given identifier, and `idx` must be defined by the caller
//
// `x` / `y` are the integer texel offsets of the sample. Nothing is emitted
// for samples whose minimum possible distance is already at or beyond the
// filter radius. `use_ar` requests anti-ringing; it is dropped for samples
// that lie at or beyond the filter's `radius_zero`.
static void polar_sample(pl_shader sh, pl_filter filter,
                         ident_t tex, ident_t lut, ident_t radius,
                         int x, int y, uint8_t comp_mask, ident_t in,
                         bool use_ar, ident_t scale)
{
    // Since we can't know the subpixel position in advance, assume a
    // worst case scenario
    int yy = y > 0 ? y-1 : y;
    int xx = x > 0 ? x-1 : x;
    float dmin = sqrt(xx*xx + yy*yy);
    // Skip samples definitely outside the radius
    if (dmin >= filter->radius)
        return;

    // Check for samples that might be skippable
    bool maybe_skippable = dmin >= filter->radius - M_SQRT2;

    // Check for samples that definitely won't contribute to anti-ringing
    const float ar_radius = filter->radius_zero;
    use_ar &= dmin < ar_radius;

    // Generated code: packs the values referenced by the GLSL template and
    // queues the external template function on the shader body buffer.
    // Note that, despite its name, `in_null_ident` is true when `in` is NOT
    // the null identifier.
{
    const struct __attribute__((__packed__)) {
    int x;
    int y;
    float ar_radius;
    ident_t radius;
    ident_t lut;
    ident_t in;
    ident_t tex;
    ident_t scale;
    bool maybe_skippable;
    bool in_null_ident;
    uint8_t comp_mask;
    bool use_ar;
} _glsl_520_args = {
#line 521
        .x = x,
#line 521
        .y = y,
#line 536
        .ar_radius = ar_radius,
#line 524
        .radius = radius,
#line 525
        .lut = lut,
#line 529
        .in = in,
#line 531
        .tex = tex,
#line 538
        .scale = scale,
#line 523
        .maybe_skippable = maybe_skippable,
#line 527
        .in_null_ident = in != NULL_IDENT,
#line 528
        .comp_mask = comp_mask,
#line 535
        .use_ar = use_ar,
    };
#line 520
    size_t _glsl_520_fn(void *, pl_str *, const uint8_t *);
    pl_str_builder_append(sh->buffers[SH_BUF_BODY], _glsl_520_fn,
                          &_glsl_520_args, sizeof(_glsl_520_args));
}
#line 554
}

// Persistent state stored in a PL_SHADER_OBJ_SAMPLER shader object, released
// by sh_sampler_uninit().
struct sh_sampler_obj {
    pl_filter filter;    // generated filter; regenerated when the config changes
    pl_shader_obj lut;   // LUT object holding the filter weights
    pl_shader_obj pass2; // for pl_shader_sample_ortho
};

// Number of LUT entries requested when generating scaler filters (also used
// as the width of the polar weights LUT)
#define SCALER_LUT_SIZE     256
// Cutoff value passed to pl_filter_generate() for scaler filters
#define SCALER_LUT_CUTOFF   1e-3f

// Uninit callback registered for sampler shader objects: releases the LUT,
// the second-pass object and the filter held by `ptr`, then resets the whole
// struct to zero. `gpu` is not used.
static void sh_sampler_uninit(pl_gpu gpu, void *ptr)
{
    struct sh_sampler_obj *const sampler = ptr;

    pl_shader_obj_destroy(&sampler->lut);
    pl_shader_obj_destroy(&sampler->pass2);
    pl_filter_free(&sampler->filter);

    const struct sh_sampler_obj blank = {0};
    *sampler = blank;
}

// LUT fill callback for polar filters: copies `params->width` float weights
// from the filter stored in the sampler object (`params->priv`) into `data`.
// The LUT must be single-component and as wide as the filter's LUT.
static void fill_polar_lut(void *data, const struct sh_lut_params *params)
{
    const struct sh_sampler_obj *sampler = params->priv;
    pl_filter filt = sampler->filter;
    pl_assert(params->width == filt->params.lut_entries && params->comps == 1);

    const size_t num_bytes = params->width * sizeof(float);
    memcpy(data, filt->weights, num_bytes);
}

// Emits a polar sampling shader for `src`, using the filter described by
// `params->filter` (which must be a polar filter).
//
// The generated filter and its weights LUT are cached in the sampler object
// `params->lut` and only regenerated when the effective filter config
// changes. Depending on the GLSL capabilities, `params->no_compute` and the
// filter radius, the samples are either dispatched from a compute shader
// that first loads all required texels into shared memory, or from a
// fragment shader that uses textureGather() where that is expected to pay
// off and direct sampling elsewhere.
//
// Returns false (after SH_FAIL where applicable) if the source could not be
// set up, the sampler object could not be obtained, or the filter / LUT
// could not be created.
bool pl_shader_sample_polar(pl_shader sh, const struct pl_sample_src *src,
                            const struct pl_sample_filter_params *params)
{
    pl_assert(params);
    if (!params->filter.polar) {
        SH_FAIL(sh, "Trying to use polar sampling with a non-polar filter?");
        return false;
    }

    uint8_t cmask;
    float rx, ry, scalef;
    ident_t src_tex, pos, pt, scale;
    if (!setup_src(sh, src, &src_tex, &pos, &pt, &rx, &ry, &cmask, &scalef, false, FASTEST))
        return false;

    struct sh_sampler_obj *obj;
    obj = SH_OBJ(sh, params->lut, PL_SHADER_OBJ_SAMPLER, struct sh_sampler_obj,
                 sh_sampler_uninit);
    if (!obj)
        return false;

    // When downscaling along either axis, widen the filter (via its blur
    // factor) by the inverse of the smaller ratio, unless disabled
    float inv_scale = 1.0 / PL_MIN(rx, ry);
    inv_scale = PL_MAX(inv_scale, 1.0);
    if (params->no_widening)
        inv_scale = 1.0;
    scale = sh_const_float(sh, "scale", scalef);

    // Only regenerate the filter if the effective config differs from the
    // one the cached filter was generated with
    struct pl_filter_config cfg = params->filter;
    cfg.antiring = PL_DEF(cfg.antiring, params->antiring);
    cfg.blur = PL_DEF(cfg.blur, 1.0f) * inv_scale;
    bool update = !obj->filter || !pl_filter_config_eq(&obj->filter->params.config, &cfg);
    if (update) {
        pl_filter_free(&obj->filter);
        obj->filter = pl_filter_generate(sh->log, pl_filter_params(
            .config         = cfg,
            .lut_entries    = SCALER_LUT_SIZE,
            .cutoff         = SCALER_LUT_CUTOFF,
        ));

        if (!obj->filter) {
            // This should never happen, but just in case ..
            SH_FAIL(sh, "Failed initializing polar filter!");
            return false;
        }
    }

    describe_filter(sh, &cfg, "polar", rx, ry);
    GLSL("// pl_shader_sample_polar                     \n"
         "vec4 color = vec4(0.0);                       \n"
         "{                                             \n"
         "vec2 pos = "$", pt = "$";                     \n"
         "vec2 size = vec2(textureSize("$", 0));        \n"
         "vec2 fcoord = fract(pos * size - vec2(0.5));  \n"
         "vec2 base = pos - pt * fcoord;                \n"
         "vec2 center = base + pt * vec2(0.5);          \n"
         "ivec2 offset;                                 \n"
         "float w, d, wsum = 0.0;                       \n"
         "int idx;                                      \n"
         "vec4 c;                                       \n",
         pos, pt, src_tex);

    bool use_ar = cfg.antiring > 0;
    if (use_ar) {
{
    const struct __attribute__((__packed__)) {
    uint8_t cmask;
} _glsl_646_args = {
#line 648
        .cmask = cmask,
    };
#line 646
    size_t _glsl_646_fn(void *, pl_str *, const uint8_t *);
    pl_str_builder_append(sh->buffers[SH_BUF_BODY], _glsl_646_fn,
                          &_glsl_646_args, sizeof(_glsl_646_args));
}
#line 650
    }

    pl_gpu gpu = SH_GPU(sh);
    const int num_comps = __builtin_popcount(cmask);
    const bool dynamic_size = SH_PARAMS(sh).dynamic_constants ||
                              !gpu || !gpu->limits.array_size_constants;

    // Samples are taken from (1 - bound) to bound (inclusive) along each axis
    int bound   = ceil(obj->filter->radius);
    int offset  = bound - 1; // padding top/left
    int padding = offset + bound; // total padding

    // Determined experimentally on modern AMD and Nvidia hardware. 32 is a
    // good tradeoff for the horizontal work group size. Apart from that,
    // just use as many threads as possible.
    int bw = 32, bh = sh_glsl(sh).max_group_threads / bw;
    int sizew, sizeh, iw, ih;

    // Disable compute shaders after a (hard-coded) radius of 6, since the
    // gather kernel generally pulls ahead here.
    bool is_compute = !params->no_compute && sh_glsl(sh).compute;
    is_compute &= obj->filter->radius < 6.0;

    // Retry loop: keeps halving the work group height until the shared
    // memory requirement fits (or the height reaches 1), then attempts to
    // switch the shader to compute mode exactly once
    while (is_compute) {
        // We need to sample everything from base_min to base_max, so make sure
        // we have enough room in shmem. The extra margin on the ceilf guards
        // against floating point inaccuracy on near-integer scaling ratios.
        const float margin = 1e-5;
        sizew = iw = (int) ceilf(bw / rx - margin) + padding + 1;
        sizeh = ih = (int) ceilf(bh / ry - margin) + padding + 1;

        if (dynamic_size) {
            // Overallocate slightly to reduce recompilation overhead
            sizew = PL_ALIGN2(sizew, 8);
            sizeh = PL_ALIGN2(sizeh, 8);
        }

        // One float array per component, plus the shared vec2 base position
        const int shmem_req = (sizew * sizeh * num_comps + 2) * sizeof(float);
        if (shmem_req > sh_glsl(sh).max_shmem_size && bh > 1) {
            // Try again with smaller work group size
            bh >>= 1;
            continue;
        }

        is_compute = sh_try_compute(sh, bw, bh, false, shmem_req);
        break;
    }

    // Note: SH_LUT_LITERAL might be faster in some specific cases, but not by
    // much, and it's catastrophically slow on other platforms.
    ident_t lut = sh_lut(sh, sh_lut_params(
        .object     = &obj->lut,
        .lut_type   = SH_LUT_TEXTURE,
        .var_type   = PL_VAR_FLOAT,
        .method     = SH_LUT_LINEAR,
        .width      = SCALER_LUT_SIZE,
        .comps      = 1,
        .update     = update,
        .fill       = fill_polar_lut,
        .priv       = obj,
    ));

    if (!lut) {
        SH_FAIL(sh, "Failed initializing polar LUT!");
        return false;
    }

    ident_t radius_c = sh_const_float(sh, "radius", obj->filter->radius);
    ident_t in = sh_fresh(sh, "in");

    if (is_compute) {

        // Compute shader kernel
        // The invocation designated by `base_id` publishes its base position
        // for the whole work group; it sits at the opposite end of the group
        // along any axis where the source rect is flipped
        GLSL("uvec2 base_id = uvec2(0u); \n");
        if (src->rect.x0 > src->rect.x1)
            GLSL("base_id.x = gl_WorkGroupSize.x - 1u; \n");
        if (src->rect.y0 > src->rect.y1)
            GLSL("base_id.y = gl_WorkGroupSize.y - 1u; \n");

        GLSLH("shared vec2 "$"_base; \n", in);
        GLSL("if (gl_LocalInvocationID.xy == base_id)               \n"
             "    "$"_base = base;                                  \n"
             "barrier();                                            \n"
             "ivec2 rel = ivec2(round((base - "$"_base) * size));   \n",
             in, in);

        ident_t sizew_c = sh_const(sh, (struct pl_shader_const) {
            .type = PL_VAR_SINT,
            .compile_time = true,
            .name = "sizew",
            .data = &sizew,
        });

        ident_t sizeh_c = sh_const(sh, (struct pl_shader_const) {
            .type = PL_VAR_SINT,
            .compile_time = true,
            .name = "sizeh",
            .data = &sizeh,
        });

        // With dynamic sizes, the (over-allocated) array dimensions differ
        // from the number of texels that actually need to be loaded
        ident_t iw_c = sizew_c, ih_c = sizeh_c;
        if (dynamic_size) {
            iw_c = sh_const_int(sh, "iw", iw);
            ih_c = sh_const_int(sh, "ih", ih);
        }

        // Load all relevant texels into shmem
        GLSL("for (int y = int(gl_LocalInvocationID.y); y < "$"; y += %d) {     \n"
             "for (int x = int(gl_LocalInvocationID.x); x < "$"; x += %d) {     \n"
             "c = textureLod("$", "$"_base + pt * vec2(x - %d, y - %d), 0.0);   \n",
             ih_c, bh, iw_c, bw, src_tex, in, offset, offset);

        // One shared array per selected component, iterating over the set
        // bits of the component mask
        for (uint8_t comps = cmask; comps;) {
            uint8_t c = __builtin_ctz(comps);
            GLSLH("shared float "$"_%d["$" * "$"]; \n", in, c, sizeh_c, sizew_c);
            GLSL(""$"_%d["$" * y + x] = c[%d]; \n", in, c, sizew_c, c);
            comps &= ~(1 << c);
        }

        GLSL("}}                     \n"
             "barrier();             \n");

        // Dispatch the actual samples
        for (int y = 1 - bound; y <= bound; y++) {
            for (int x = 1 - bound; x <= bound; x++) {
                GLSL("idx = "$" * rel.y + rel.x + "$" * %d + %d; \n",
                     sizew_c, sizew_c, y + offset, x + offset);
                polar_sample(sh, obj->filter, src_tex, lut, radius_c,
                             x, y, cmask, in, use_ar, scale);
            }
        }
    } else {
        // Fragment shader sampling
        for (uint8_t comps = cmask; comps;) {
            uint8_t c = __builtin_ctz(comps);
            GLSL("vec4 "$"_%d; \n", in, c);
            comps &= ~(1 << c);
        }

        // For maximum efficiency, we want to use textureGather() if
        // possible, rather than direct sampling. Since this is not
        // always possible/sensible, we need to possibly intermix gathering
        // with regular sampling. This requires keeping track of which
        // pixels in the next row were already gathered by the previous
        // row.
        uint64_t gathered_cur = 0x0, gathered_next = 0x0;
        const float radius2 = PL_SQUARE(obj->filter->radius);
        const int base = bound - 1;

        // Each column of the sampling window needs its own bit in the
        // `gathered_*` bitmasks
        if (base + bound >= 8 * sizeof(gathered_cur)) {
            SH_FAIL(sh, "Polar radius %f exceeds implementation capacity!",
                    obj->filter->radius);
            return false;
        }

        for (int y = 1 - bound; y <= bound; y++) {
            for (int x = 1 - bound; x <= bound; x++) {
                // Skip already gathered texels
                uint64_t bit = 1llu << (base + x);
                if (gathered_cur & bit)
                    continue;

                // Using texture gathering is only more efficient than direct
                // sampling in the case where we expect to be able to use all
                // four gathered texels, without having to discard any. So
                // only do it if we suspect it will be a win rather than a
                // loss.
                int xx = x*x, xx1 = (x+1)*(x+1);
                int yy = y*y, yy1 = (y+1)*(y+1);
                bool use_gather = PL_MAX(xx, xx1) + PL_MAX(yy, yy1) < radius2;
                use_gather &= PL_MAX(x, y) <= sh_glsl(sh).max_gather_offset;
                use_gather &= PL_MIN(x, y) >= sh_glsl(sh).min_gather_offset;
                use_gather &= !src->tex || src->tex->params.format->gatherable;

                // Gathering from components other than the R channel requires
                // support for GLSL 400, which introduces the overload of
                // textureGather* that allows specifying the component.
                //
                // This is also the minimum requirement if we don't know the
                // texture format capabilities, for the sampler2D interface
                if (cmask != 0x1 || !src->tex)
                    use_gather &= sh_glsl(sh).version >= 400;

                if (!use_gather) {
                    // Switch to direct sampling instead
                    polar_sample(sh, obj->filter, src_tex, lut, radius_c,
                                 x, y, cmask, NULL_IDENT, use_ar, scale);
                    continue;
                }

                // Gather the four surrounding texels simultaneously
                // (the offset-free and component-free overloads are used
                // whenever the offset is zero or the component is 0)
                for (uint8_t comps = cmask; comps;) {
                    uint8_t c = __builtin_ctz(comps);
                    if (x || y) {
                        if (c) {
                            GLSL($"_%d = textureGatherOffset("$", "
                                 "center, ivec2(%d, %d), %d); \n",
                                 in, c, src_tex, x, y, c);
                        } else {
                            GLSL($"_0 = textureGatherOffset("$", "
                                 "center, ivec2(%d, %d)); \n",
                                 in, src_tex, x, y);
                        }
                    } else {
                        if (c) {
                            GLSL($"_%d = textureGather("$", center, %d); \n",
                                 in, c, src_tex, c);
                        } else {
                            GLSL($"_0 = textureGather("$", center); \n",
                                 in, src_tex);
                        }
                    }
                    comps &= ~(1 << c);
                }

                // Mix in all of the points with their weights
                for (int p = 0; p < 4; p++) {
                    // The four texels are gathered counterclockwise starting
                    // from the bottom left
                    static const int xo[4] = {0, 1, 1, 0};
                    static const int yo[4] = {1, 1, 0, 0};
                    if (x+xo[p] > bound || y+yo[p] > bound)
                        continue; // next subpixel

                    if (!yo[p] && (gathered_cur & (bit << xo[p])))
                        continue; // already sampled

                    GLSL("idx = %d;\n", p);
                    polar_sample(sh, obj->filter, src_tex, lut, radius_c,
                                 x+xo[p], y+yo[p], cmask, in, use_ar, scale);
                }

                // Mark the other next row's pixels as already gathered
                gathered_next |= bit | (bit << 1);
                x++; // skip adjacent pixel
            }

            // Prepare for new row
            gathered_cur = gathered_next;
            gathered_next = 0;
        }
    }

    // Generated code: queues the final template, which receives the scale,
    // the anti-ringing strength and flags, and whether the alpha channel is
    // absent from the component mask
{
    const struct __attribute__((__packed__)) {
    ident_t scale;
    ident_t cfg_antiring;
    bool use_ar;
    uint8_t cmask;
    bool cmask_1_pl_channel_a;
} _glsl_892_args = {
#line 893
        .scale = scale,
#line 900
        .cfg_antiring = sh_const_float(sh, "cfg_antiring", cfg.antiring),
#line 894
        .use_ar = use_ar,
#line 895
        .cmask = cmask,
#line 903
        .cmask_1_pl_channel_a = !(cmask & (1 << PL_CHANNEL_A)),
    };
#line 892
    size_t _glsl_892_fn(void *, pl_str *, const uint8_t *);
    pl_str_builder_append(sh->buffers[SH_BUF_BODY], _glsl_892_fn,
                          &_glsl_892_args, sizeof(_glsl_892_args));
}
#line 906

    return true;
}

// LUT fill callback for the separable (orthogonal) sampler.
//
// `params->priv` must point to the `struct sh_sampler_obj` whose `filter`
// holds the weights to upload. `data` is written as SCALER_LUT_SIZE rows of
// `filt->row_stride` floats each.
//
// When the filter's radius equals its first zero crossing, every pair of
// adjacent weights (w0, w1) is folded into (w0 + w1, w1 / (w0 + w1)): the
// combined weight followed by the fractional offset between the two taps.
// Otherwise the raw weights are copied verbatim.
static void fill_ortho_lut(void *data, const struct sh_lut_params *params)
{
    const struct sh_sampler_obj *obj = params->priv;
    pl_filter filt = obj->filter;

    if (filt->radius == filt->radius_zero) {
        // Main lobe covers entire radius, so all weights are positive, meaning
        // we can use the linear resampling trick
        pl_assert(filt->row_size % 2 == 0); // weights are consumed in pairs
        for (int n = 0; n < SCALER_LUT_SIZE; n++) {
            const float *weights = filt->weights + n * filt->row_stride;
            float *row = (float *) data + n * filt->row_stride;
            for (int i = 0; i < filt->row_size; i += 2) {
                const float w0 = weights[i], w1 = weights[i+1];
                const float sum = w0 + w1;
                assert(sum >= 0.0f);
                row[i] = sum;
                // A pair of zero weights would evaluate 0/0 and store a NaN
                // offset in the LUT. The pair contributes nothing, so any
                // finite offset is equivalent; use 0.
                row[i+1] = sum > 0.0f ? w1 / sum : 0.0f;
            }
        }
    } else {
        // Filter has negative lobes: upload the weights unmodified
        size_t entries = SCALER_LUT_SIZE * filt->row_stride;
        pl_assert(params->width * params->height * params->comps == entries);
        memcpy(data, filt->weights, entries * sizeof(float));
    }
}

// Indices of the two separable scaling passes. SEP_PASSES is the number of
// passes and is used to size per-pass arrays.
enum {
    SEP_VERT,   // first enumerator, i.e. 0
    SEP_HORIZ,
    SEP_PASSES,
};

// Emits one pass of a separable (orthogonal) scaler into `sh`.
//
// `src` may only require scaling along a single axis: the vertical pass is
// chosen when the horizontal ratio is ~1, the horizontal pass when the
// vertical ratio is ~1, and anything else is rejected. Polar filters are
// rejected as well.
//
// The generated filter and its LUT are cached in the sampler object reached
// through `params->lut` (using the nested `pass2` object for the horizontal
// pass) and are only regenerated when the effective filter config changes.
//
// Returns true on success and false on any failure.
bool pl_shader_sample_ortho2(pl_shader sh, const struct pl_sample_src *src,
                             const struct pl_sample_filter_params *params)
{
    pl_assert(params);
    if (params->filter.polar) {
        SH_FAIL(sh, "Trying to use separated sampling with a polar filter?");
        return false;
    }

    pl_gpu gpu = SH_GPU(sh);
    pl_assert(gpu);

    uint8_t comps;
    float ratio[SEP_PASSES], scale;
    ident_t src_tex, pos, pt;
    if (!setup_src(sh, src, &src_tex, &pos, &pt,
                   &ratio[SEP_HORIZ], &ratio[SEP_VERT],
                   &comps, &scale, false, LINEAR))
        return false;


    // Pick the axis that actually needs scaling
    int pass;
    if (fabs(ratio[SEP_HORIZ] - 1.0f) < 1e-6f) {
        pass = SEP_VERT;
    } else if (fabs(ratio[SEP_VERT] - 1.0f) < 1e-6f) {
        pass = SEP_HORIZ;
    } else {
        SH_FAIL(sh, "Trying to use pl_shader_sample_ortho with a "
                "pl_sample_src that requires scaling in multiple directions "
                "(rx=%f, ry=%f), this is not possible!",
                ratio[SEP_HORIZ], ratio[SEP_VERT]);
        return false;
    }

    // We can store a separate sampler object per dimension, so dispatch the
    // right one. This is needed for two reasons:
    // 1. Anamorphic content can have a different scaling ratio for each
    //    dimension. In particular, you could be upscaling in one and
    //    downscaling in the other.
    // 2. After fixing the source for `setup_src`, we lose information about
    //    the scaling ratio of the other component. (Although this is only a
    //    minor reason and could easily be changed with some boilerplate)
    struct sh_sampler_obj *obj;
    obj = SH_OBJ(sh, params->lut, PL_SHADER_OBJ_SAMPLER,
                 struct sh_sampler_obj, sh_sampler_uninit);
    if (!obj)
        return false;

    if (pass != SEP_VERT) {
        obj = SH_OBJ(sh, &obj->pass2, PL_SHADER_OBJ_SAMPLER,
                     struct sh_sampler_obj, sh_sampler_uninit);
        // Handle failure the same way as for the primary object: an assert()
        // is compiled out under NDEBUG and would let a NULL `obj` be
        // dereferenced below.
        if (!obj)
            return false;
    }

    // Widen the kernel when downscaling, unless the caller opted out
    float inv_scale = 1.0 / ratio[pass];
    inv_scale = PL_MAX(inv_scale, 1.0);
    if (params->no_widening)
        inv_scale = 1.0;

    struct pl_filter_config cfg = params->filter;
    cfg.antiring = PL_DEF(cfg.antiring, params->antiring);
    cfg.blur = PL_DEF(cfg.blur, 1.0f) * inv_scale;
    bool update = !obj->filter || !pl_filter_config_eq(&obj->filter->params.config, &cfg);

    if (update) {
        pl_filter_free(&obj->filter);
        obj->filter = pl_filter_generate(sh->log, pl_filter_params(
            .config             = cfg,
            .lut_entries        = SCALER_LUT_SIZE,
            .max_row_size       = gpu->limits.max_tex_2d_dim / 4,
            .row_stride_align   = 4,
        ));

        if (!obj->filter) {
            // This should never happen, but just in case ..
            SH_FAIL(sh, "Failed initializing separated filter!");
            return false;
        }
    }

    int N = obj->filter->row_size; // number of samples to convolve
    int width = obj->filter->row_stride / 4; // width of the LUT texture
    ident_t lut = sh_lut(sh, sh_lut_params(
        .object     = &obj->lut,
        .var_type   = PL_VAR_FLOAT,
        .method     = SH_LUT_LINEAR,
        .width      = width,
        .height     = SCALER_LUT_SIZE,
        .comps      = 4,
        .update     = update,
        .fill       = fill_ortho_lut,
        .priv       = obj,
    ));
    if (!lut) {
        SH_FAIL(sh, "Failed initializing separated LUT!");
        return false;
    }

    const int dir[SEP_PASSES][2] = {
        [SEP_HORIZ] = {1, 0},
        [SEP_VERT]  = {0, 1},
    };

    static const char *names[SEP_PASSES] = {
        [SEP_HORIZ] = "ortho (horiz)",
        [SEP_VERT]  = "ortho (vert)",
    };

    describe_filter(sh, &cfg, names[pass], ratio[pass], ratio[pass]);

    float denom = PL_MAX(1, width - 1); // avoid division by zero
    // Anti-ringing is only applied when upscaling
    bool use_ar = cfg.antiring > 0 && ratio[pass] > 1.0;
    bool use_linear = obj->filter->radius == obj->filter->radius_zero;
    use_ar &= !use_linear; // filter has no negative weights

    // Serialize the template arguments; the field order and types must match
    // the struct that _glsl_1057_fn reads back.
{
    const struct __attribute__((__packed__)) {
    float dir_pass_0;
    float dir_pass_1;
    float n_2_1;
    unsigned use_linear_2u_1u;
    float denom;
    ident_t pos;
    ident_t pt;
    ident_t src_tex;
    ident_t n;
    ident_t lut;
    ident_t cfg_antiring;
    ident_t scale;
    uint8_t comps;
    bool use_ar;
    bool use_linear;
} _glsl_1057_args = {
#line 1062
        .dir_pass_0 = dir[pass][0],
#line 1062
        .dir_pass_1 = dir[pass][1],
#line 1066
        .n_2_1 = N / 2 - 1,
#line 1075
        .use_linear_2u_1u = use_linear ? 2u : 1u,
#line 1077
        .denom = denom,
#line 1060
        .pos = pos,
#line 1060
        .pt = pt,
#line 1061
        .src_tex = src_tex,
#line 1075
        .n = sh_const_uint(sh, "n",  N),
#line 1077
        .lut = lut,
#line 1091
        .cfg_antiring = sh_const_float(sh, "cfg_antiring",  cfg.antiring),
#line 1092
        .scale = sh_const_float(sh, "scale",  scale),
#line 1069
        .comps = comps,
#line 1070
        .use_ar = use_ar,
#line 1079
        .use_linear = use_linear,
    };
#line 1057
    size_t _glsl_1057_fn(void *, pl_str *, const uint8_t *);
    pl_str_builder_append(sh->buffers[SH_BUF_BODY], _glsl_1057_fn,
                          &_glsl_1057_args, sizeof(_glsl_1057_args));
}
#line 1094

    return true;
}

// Default distortion parameters, initialized from PL_DISTORT_DEFAULTS.
const struct pl_distort_params pl_distort_default_params = { PL_DISTORT_DEFAULTS };

// Emits a shader that samples `src_tex` through an affine distortion.
//
// The transform in `params` is composed with the texture -> aspect-normalized
// and aspect-normalized -> canvas mappings, optionally shrunk so its bounding
// box fits the canvas (`params->constrain`), then inverted and bound as
// shader variables so each output position can be mapped back to a texture
// coordinate. Returns without emitting anything if `sh_require` fails.
void pl_shader_distort(pl_shader sh, pl_tex src_tex, int out_w, int out_h,
                       const struct pl_distort_params *params)
{
    pl_assert(params);
    if (!sh_require(sh, PL_SHADER_SIG_NONE, out_w, out_h))
        return;

    const int src_w = src_tex->params.w;
    const int src_h = src_tex->params.h;

    // The longer side maps to 1, the shorter side to the aspect ratio
    const bool wider = src_w > src_h;
    const float rx = wider ? 1.0f : (float) src_w / src_h;
    const float ry = wider ? (float) src_h / src_w : 1.0f;

    // Map from texel coordinates [0,1]² to aspect-normalized representation
    const pl_transform2x2 tex2norm = {
        .mat.m = {
            { 2 * rx, 0 },
            { 0, -2 * ry },
        },
        .c = { -rx, ry },
    };

    // Map from aspect-normalized representation to canvas coords [-1,1]²
    float sx = 1.0f, sy = 1.0f;
    if (params->unscaled) {
        sx = (float) src_w / out_w;
        sy = (float) src_h / out_h;
    }

    const pl_transform2x2 norm2canvas = {
        .mat.m = {
            { sx / rx, 0 },
            { 0, sy / ry },
        },
    };

    // transform = norm2canvas * params->transform * tex2norm
    struct pl_transform2x2 transform = params->transform;
    pl_transform2x2_mul(&transform, &tex2norm);
    pl_transform2x2_rmul(&norm2canvas, &transform);

    if (params->constrain) {
        // Shrink so the transformed unit rectangle spans at most 2 units
        pl_rect2df bounds = pl_transform2x2_bounds(&transform, &(pl_rect2df) {
            .x1 = 1, .y1 = 1,
        });
        const float longest = fmaxf(pl_rect_w(bounds), pl_rect_h(bounds));
        const float extent = fmaxf(longest, 2.0f);
        pl_transform2x2_scale(&transform, 2.0f / extent);
    }

    // Bind the canvas coordinates as [-1,1]², flipped vertically to correspond
    // to normal mathematical axis conventions
    static const pl_rect2df canvas = {
        .x0 = -1.0f, .x1 =  1.0f,
        .y0 =  1.0f, .y1 = -1.0f,
    };

    ident_t pos = sh_attr_vec2(sh, "pos", &canvas);
    ident_t pt;
    ident_t tex = sh_bind(sh, src_tex, params->address_mode,
                          PL_TEX_SAMPLE_LINEAR, "tex", NULL, NULL, &pt);

    // Bind the inverse of the tex2canvas transform (i.e. canvas2tex)
    pl_transform2x2_invert(&transform);
    ident_t tf = sh_var(sh, (struct pl_shader_var) {
        .var  = pl_var_mat2("tf"),
        .data = PL_TRANSPOSE_2X2(transform.mat.m),
    });

    ident_t tf_c = sh_var(sh, (struct pl_shader_var) {
        .var  = pl_var_vec2("tf_c"),
        .data = transform.c,
    });

    // See pl_shader_sample_bicubic
    sh_describe(sh, "distortion");
{
    const struct __attribute__((__packed__)) {
    ident_t tf;
    ident_t pos;
    ident_t tf_c;
    ident_t pt;
    ident_t tex;
    bool params_bicubic;
    bool params_alpha_mode;
    bool params_alpha_mode_pl_alpha_premultiplied;
} _glsl_1171_args = {
#line 1174
        .tf = tf,
#line 1174
        .pos = pos,
#line 1174
        .tf_c = tf_c,
#line 1175
        .pt = pt,
#line 1177
        .tex = tex,
#line 1176
        .params_bicubic = params->bicubic,
#line 1200
        .params_alpha_mode = params->alpha_mode,
#line 1203
        .params_alpha_mode_pl_alpha_premultiplied = params->alpha_mode == PL_ALPHA_PREMULTIPLIED,
    };
#line 1171
    size_t _glsl_1171_fn(void *, pl_str *, const uint8_t *);
    pl_str_builder_append(sh->buffers[SH_BUF_BODY], _glsl_1171_fn,
                          &_glsl_1171_args, sizeof(_glsl_1171_args));
}
#line 1209

}

// Auto-generated template functions:
// GLSL template emitter: appends a block that declares `vec4 color` and fills
// it from four textureLod() taps blended with cubic weights, multiplied by
// the `scale` identifier. `ptr` holds the packed arguments; the number of
// argument bytes consumed is returned.
size_t _glsl_331_fn(void *alloc, pl_str *buf, const uint8_t *ptr);
size_t _glsl_331_fn(void *alloc, pl_str *buf, const uint8_t *ptr)
{
    // Layout of the serialized arguments read from `ptr`
    struct __attribute__((__packed__)) {
        ident_t pos;
        ident_t tex;
        ident_t pt;
        ident_t scale;
    } args;
    memcpy(&args, ptr, sizeof(args));

    const ident_t pos = args.pos, tex = args.tex;
    const ident_t pt = args.pt, scale = args.scale;

#line 331
    pl_str_append_asprintf_c(alloc, buf,
        "\n"
        "vec4 color;\n"
        "{\n"
        "vec2 pos = _%hx;\n"
        "vec2 size = vec2(textureSize(_%hx, 0));\n"
        "vec2 frac  = fract(pos * size + vec2(0.5));\n"
        "vec2 frac2 = frac * frac;\n"
        "vec2 inv   = vec2(1.0) - frac;\n"
        "vec2 inv2  = inv * inv;\n"
        "\n"
        "vec2 w0 = 1.0/6.0 * inv2 * inv;\n"
        "vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac);\n"
        "vec2 w2 = 2.0/3.0 - 0.5 * inv2  * (2.0 - inv);\n"
        "vec2 w3 = 1.0/6.0 * frac2 * frac;\n"
        "vec4 g = vec4(w0 + w1, w2 + w3);\n"
        "vec4 h = vec4(w1, w3) / g + inv.xyxy;\n"
        "h.xy -= vec2(2.0);\n"
        "\n"
        "vec4 p = pos.xyxy + _%hx.xyxy * h;\n"
        "vec4 c00 = textureLod(_%hx, p.xy, 0.0);\n"
        "vec4 c01 = textureLod(_%hx, p.xw, 0.0);\n"
        "vec4 c0 = mix(c01, c00, g.y);\n"
        "vec4 c10 = textureLod(_%hx, p.zy, 0.0);\n"
        "vec4 c11 = textureLod(_%hx, p.zw, 0.0);\n"
        "vec4 c1 = mix(c11, c10, g.y);\n"
        "color = _%hx * mix(c1, c0, g.x);\n"
        "}\n",
        pos, tex, pt,
        tex, tex, tex, tex,
        scale);

    return sizeof(args);
}
// GLSL template emitter: appends a block that declares `vec4 color` and fills
// it from a single textureLod() tap whose position is warped by
// smoothstep(), multiplied by the `scale` identifier. `ptr` holds the packed
// arguments; the number of argument bytes consumed is returned.
size_t _glsl_375_fn(void *alloc, pl_str *buf, const uint8_t *ptr);
size_t _glsl_375_fn(void *alloc, pl_str *buf, const uint8_t *ptr)
{
    // Layout of the serialized arguments read from `ptr`
    struct __attribute__((__packed__)) {
        ident_t pos;
        ident_t tex;
        ident_t pt;
        ident_t scale;
    } args;
    memcpy(&args, ptr, sizeof(args));

    const ident_t pos = args.pos, tex = args.tex;
    const ident_t pt = args.pt, scale = args.scale;

#line 375
    pl_str_append_asprintf_c(alloc, buf,
        "\n"
        "vec4 color;\n"
        "{\n"
        "vec2 pos  = _%hx;\n"
        "vec2 size = vec2(textureSize(_%hx, 0));\n"
        "vec2 frac = fract(pos * size + vec2(0.5));\n"
        "pos += _%hx * (smoothstep(0.0, 1.0, frac) - frac);\n"
        "color = _%hx * textureLod(_%hx, pos, 0.0);\n"
        "}\n",
        pos, tex, pt, scale, tex);

    return sizeof(args);
}
// GLSL template emitter: appends a block that declares `vec4 color` and fills
// it from four textureLod() taps blended with exp()-based weights, multiplied
// by the `scale` identifier. `ptr` holds the packed arguments; the number of
// argument bytes consumed is returned.
size_t _glsl_401_fn(void *alloc, pl_str *buf, const uint8_t *ptr);
size_t _glsl_401_fn(void *alloc, pl_str *buf, const uint8_t *ptr)
{
    // Layout of the serialized arguments read from `ptr`
    struct __attribute__((__packed__)) {
        ident_t pos;
        ident_t tex;
        ident_t pt;
        ident_t scale;
    } args;
    memcpy(&args, ptr, sizeof(args));

    const ident_t pos = args.pos, tex = args.tex;
    const ident_t pt = args.pt, scale = args.scale;

#line 401
    pl_str_append_asprintf_c(alloc, buf,
        "\n"
        "vec4 color;\n"
        "{\n"
        "vec2 pos  = _%hx;\n"
        "vec2 size = vec2(textureSize(_%hx, 0));\n"
        "vec2 off  = -fract(pos * size + vec2(0.5));\n"
        "vec2 off2 = -2.0 * off * off;\n"
        "\n"
        "vec2 w0 = exp(off2 + 4.0 * off - vec2(2.0));\n"
        "vec2 w1 = exp(off2);\n"
        "vec2 w2 = exp(off2 - 4.0 * off - vec2(2.0));\n"
        "vec2 w3 = exp(off2 - 8.0 * off - vec2(8.0));\n"
        "vec4 g = vec4(w0 + w1, w2 + w3);\n"
        "vec4 h = vec4(w1, w3) / g;\n"
        "h.xy -= vec2(1.0);\n"
        "h.zw += vec2(1.0);\n"
        "g.xy /= g.xy + g.zw; \n"
        "\n"
        "vec4 p = pos.xyxy + _%hx.xyxy * (h + off.xyxy);\n"
        "vec4 c00 = textureLod(_%hx, p.xy, 0.0);\n"
        "vec4 c01 = textureLod(_%hx, p.xw, 0.0);\n"
        "vec4 c0 = mix(c01, c00, g.y);\n"
        "vec4 c10 = textureLod(_%hx, p.zy, 0.0);\n"
        "vec4 c11 = textureLod(_%hx, p.zw, 0.0);\n"
        "vec4 c1 = mix(c11, c10, g.y);\n"
        "color = _%hx * mix(c1, c0, g.x);\n"
        "}\n",
        pos, tex, pt,
        tex, tex, tex, tex,
        scale);

    return sizeof(args);
}
// GLSL template emitter: appends a block that declares `vec4 color` and fills
// it from one textureLod() tap whose sub-texel position is rescaled by
// (rx, ry) and clamped to [0,1]. When `threshold_0` is set, coefficients
// within `threshold` of 0 or 1 are additionally snapped to 0 or 1. `ptr`
// holds the packed arguments; the number of argument bytes consumed is
// returned.
size_t _glsl_442_fn(void *alloc, pl_str *buf, const uint8_t *ptr);
size_t _glsl_442_fn(void *alloc, pl_str *buf, const uint8_t *ptr)
{
    // Layout of the serialized arguments read from `ptr`
    struct __attribute__((__packed__)) {
        ident_t pos;
        ident_t tex;
        ident_t rx;
        ident_t ry;
        ident_t threshold;
        ident_t pt;
        ident_t scale;
        bool threshold_0;
    } args;
    memcpy(&args, ptr, sizeof(args));

    const ident_t tex = args.tex;

    // Prologue: compute the rescaled, clamped blending coefficient
#line 442
    pl_str_append_asprintf_c(alloc, buf,
        "\n"
        "vec4 color;\n"
        "{\n"
        "vec2 pos = _%hx;\n"
        "vec2 size = vec2(textureSize(_%hx, 0));\n"
        "\n"
        "vec2 fcoord = fract(pos * size - vec2(0.5));\n"
        "float rx = _%hx;\n"
        "float ry = _%hx;\n"
        "vec2 coeff = (fcoord - vec2(0.5)) * vec2(rx, ry);\n"
        "coeff = clamp(coeff + vec2(0.5), 0.0, 1.0);\n",
        args.pos, tex, args.rx, args.ry);

    // Optional: snap near-0 / near-1 coefficients
    if (args.threshold_0) {
#line 454
        pl_str_append_asprintf_c(alloc, buf,
            "float thresh = _%hx;\n"
            "coeff = mix(coeff, vec2(0.0),\n"
            "lessThan(coeff, vec2(thresh)));\n"
            "coeff = mix(coeff, vec2(1.0),\n"
            "greaterThan(coeff, vec2(1.0 - thresh)));\n",
            args.threshold);
    }

    // Epilogue: shift the position and sample
#line 461
    pl_str_append_asprintf_c(alloc, buf,
        "\n"
        "pos += (coeff - fcoord) * _%hx;\n"
        "color = _%hx * textureLod(_%hx, pos, 0.0);\n"
        "}\n",
        args.pt, args.scale, tex);

    return sizeof(args);
}
// GLSL template emitter for a single filter tap at integer offset (x, y).
//
// The emitted code computes the distance `d` from the tap to `fcoord`, looks
// up the weight `w` through the `lut` identifier, adds it to `wsum`, fetches
// the sample `c` and accumulates `w * c` into `color` for every component set
// in `comp_mask`. Optional parts:
//   - `maybe_skippable`: wraps the tap in `if (d < radius) { ... }`
//   - `use_ar`: for taps with `d <= ar_radius`, also accumulates the
//     per-component `ar<c>` / `wwsum<c>` values
// `ptr` holds the packed arguments; the number of argument bytes consumed is
// returned.
size_t _glsl_520_fn(void *alloc, pl_str *buf, const uint8_t *ptr);
size_t _glsl_520_fn(void *alloc, pl_str *buf, const uint8_t *ptr)
{
// Layout of the serialized arguments read from `ptr`
struct __attribute__((__packed__)) {
    int x;
    int y;
    float ar_radius;
    ident_t radius;
    ident_t lut;
    ident_t in;
    ident_t tex;
    ident_t scale;
    bool maybe_skippable;
    bool in_null_ident;
    uint8_t comp_mask;
    bool use_ar;
} vars;
memcpy(&vars, ptr, sizeof(vars));

#line 521
    pl_str_append_asprintf_c(alloc, buf,
        "offset = ivec2(%d, %d);\n"
        "d = length(vec2(offset) - fcoord);\n",
        vars.x,
        vars.y
    );

// Opens a GLSL `if` that is closed by the matching check at the very end
if (vars.maybe_skippable)
#line 524
    pl_str_append_asprintf_c(alloc, buf,
        "if (d < _%hx) {\n",
        vars.radius
    );

#line 525
    pl_str_append_asprintf_c(alloc, buf,
        "w = _%hx(d * 1.0 / _%hx);\n"
        "wsum += w;\n",
        vars.lut,
        vars.radius
    );

// NOTE(review): when this flag is set, samples are read from the `in_<c>[idx]`
// arrays rather than from the texture; the flag name suggests it encodes a
// comparison of `in` against NULL_IDENT -- confirm the polarity at the caller.
if (vars.in_null_ident) {
// Iterates over the set bits of comp_mask, lowest bit first
for (uint8_t _mask = vars.comp_mask, c; _mask && (c = __builtin_ctz(_mask), 1); _mask &= ~(1u << c))
#line 529
    pl_str_append_asprintf_c(alloc, buf,
        "c[%d] = _%hx_%d[idx];\n",
        c,
        vars.in,
        c
    );

} else {
#line 531
    pl_str_append_asprintf_c(alloc, buf,
        "c = textureLod(_%hx, base + pt * vec2(offset), 0.0);\n",
        vars.tex
    );

}
// Accumulate the weighted sample for each enabled component
for (uint8_t _mask = vars.comp_mask, c; _mask && (c = __builtin_ctz(_mask), 1); _mask &= ~(1u << c))
#line 534
    pl_str_append_asprintf_c(alloc, buf,
        "color[%d] += w * c[%d];\n",
        c,
        c
    );

// Anti-ringing accumulators, restricted to taps within ar_radius
if (vars.use_ar) {
#line 536
    pl_str_append_asprintf_c(alloc, buf,
        "if (d <= float(%f)) {\n",
        vars.ar_radius
    );

for (uint8_t _mask = vars.comp_mask, c; _mask && (c = __builtin_ctz(_mask), 1); _mask &= ~(1u << c)) {
#line 538
    pl_str_append_asprintf_c(alloc, buf,
        "cc = vec2(_%hx * c[%d]);\n"
        "cc.x = 1.0 - cc.x;\n"
        "ww = cc + vec2(0.10);\n"
        "ww = ww * ww;\n"
        "ww = ww * ww;\n"
        "ww = ww * ww;\n"
        "ww = ww * ww;\n"
        "ww = ww * ww;\n"
        "ww = w * ww;\n"
        "ar%d += ww * cc;\n"
        "wwsum%d += ww;\n",
        vars.scale,
        c,
        c,
        c
    );

}
#line 550
    pl_str_append(alloc, buf, pl_str0(
        "}\n"
    ));

}
// Closes the `if (d < radius) {` opened above
if (vars.maybe_skippable)
#line 553
    pl_str_append(alloc, buf, pl_str0(
        "}\n"
    ));


return sizeof(vars);
}
// GLSL template emitter: declares the `ww` / `cc` temporaries and, for every
// component set in `cmask`, a zero-initialized pair of `ar<c>` / `wwsum<c>`
// accumulators. `ptr` holds the packed arguments; the number of argument
// bytes consumed is returned.
size_t _glsl_646_fn(void *alloc, pl_str *buf, const uint8_t *ptr);
size_t _glsl_646_fn(void *alloc, pl_str *buf, const uint8_t *ptr)
{
    // Layout of the serialized arguments read from `ptr`
    struct __attribute__((__packed__)) {
        uint8_t cmask;
    } args;
    memcpy(&args, ptr, sizeof(args));

#line 647
    pl_str_append(alloc, buf, pl_str0(
        "vec2 ww, cc;\n"
    ));

    // Visit the set bits of the mask from lowest to highest
    uint8_t remaining = args.cmask;
    while (remaining) {
        const uint8_t c = __builtin_ctz(remaining);
#line 649
        pl_str_append_asprintf_c(alloc, buf,
            "vec2 ar%d = vec2(0.0), wwsum%d = vec2(0.0);\n",
            c, c);
        remaining &= ~(1u << c);
    }

    return sizeof(args);
}
// GLSL template emitter for the final normalization step of a weighted sum.
//
// Emits `color = scale / wsum * color;`, then (if `use_ar`) blends each
// component in `cmask` towards a value clamped to the range derived from
// `ar<c> / wwsum<c>`, using the `cfg_antiring` identifier as mix factor.
// If `cmask_1_pl_channel_a` is set, alpha is forced to 1.0. Finally emits a
// closing `}`; the brace it closes is not opened by this template.
// `ptr` holds the packed arguments; the number of argument bytes consumed is
// returned.
size_t _glsl_892_fn(void *alloc, pl_str *buf, const uint8_t *ptr);
size_t _glsl_892_fn(void *alloc, pl_str *buf, const uint8_t *ptr)
{
// Layout must match `_glsl_892_args` as serialized by the caller
struct __attribute__((__packed__)) {
    ident_t scale;
    ident_t cfg_antiring;
    bool use_ar;
    uint8_t cmask;
    bool cmask_1_pl_channel_a;
} vars;
memcpy(&vars, ptr, sizeof(vars));

#line 893
    pl_str_append_asprintf_c(alloc, buf,
        "color = _%hx / wsum * color;\n",
        vars.scale
    );

if (vars.use_ar) {
// Iterates over the set bits of cmask, lowest bit first
for (uint8_t _mask = vars.cmask, c; _mask && (c = __builtin_ctz(_mask), 1); _mask &= ~(1u << c)) {
#line 896
    pl_str_append_asprintf_c(alloc, buf,
        "ww = ar%d / wwsum%d;\n"
        "ww.x = 1.0 - ww.x;\n"
        "w = clamp(color[%d], ww.x, ww.y);\n"
        "w = mix(w, dot(ww, vec2(0.5)), ww.x > ww.y);\n"
        "color[%d] = mix(color[%d], w, _%hx);\n",
        c,
        c,
        c,
        c,
        c,
        vars.cfg_antiring
    );

}
}
// Set by the caller when the alpha channel is not part of cmask
if (vars.cmask_1_pl_channel_a)
#line 904
    pl_str_append(alloc, buf, pl_str0(
        "color.a = 1.0;\n"
    ));

#line 905
    pl_str_append(alloc, buf, pl_str0(
        "}\n"
    ));


return sizeof(vars);
}
// GLSL template emitter for one pass of the separable (orthogonal) filter.
//
// The emitted block declares `vec4 color`, then loops over the `n` filter
// taps along `dir`, reloading four LUT weights into `ws` every fourth tap
// and accumulating `ws[n % 4] * c` into `ca`. Variants:
//   - `use_linear`: the loop advances two taps at a time and the second
//     value of each LUT pair is added to the sampling offset
//   - `use_ar`: tracks the min/max of the two central taps and blends `ca`
//     towards its clamped value using the `cfg_antiring` identifier
// The result is written to the `comps` swizzle of `color`, times `scale`.
// `ptr` holds the packed arguments; the number of argument bytes consumed is
// returned.
size_t _glsl_1057_fn(void *alloc, pl_str *buf, const uint8_t *ptr);
size_t _glsl_1057_fn(void *alloc, pl_str *buf, const uint8_t *ptr)
{
// Layout must match `_glsl_1057_args` as serialized by
// pl_shader_sample_ortho2
struct __attribute__((__packed__)) {
    float dir_pass_0;
    float dir_pass_1;
    float n_2_1;
    unsigned use_linear_2u_1u;
    float denom;
    ident_t pos;
    ident_t pt;
    ident_t src_tex;
    ident_t n;
    ident_t lut;
    ident_t cfg_antiring;
    ident_t scale;
    uint8_t comps;
    bool use_ar;
    bool use_linear;
} vars;
memcpy(&vars, ptr, sizeof(vars));

#line 1057
    pl_str_append_asprintf_c(alloc, buf,
        "\n"
        "vec4 color = vec4(0.0, 0.0, 0.0, 1.0);\n"
        "{\n"
        "vec2 pos = _%hx, pt = _%hx;\n"
        "vec2 size = vec2(textureSize(_%hx, 0));\n"
        "vec2 dir = vec2(float(%f), float(%f));\n"
        "pt *= dir;\n"
        "vec2 fcoord2 = fract(pos * size - vec2(0.5));\n"
        "float fcoord = dot(fcoord2, dir);\n"
        "vec2 base = pos - fcoord * pt - pt * vec2(float(%f));\n"
        "vec4 ws;\n"
        "float off;\n"
        "%s c, ca = %s(0.0);\n",
        vars.pos,
        vars.pt,
        vars.src_tex,
        vars.dir_pass_0,
        vars.dir_pass_1,
        vars.n_2_1,
        sh_float_type(vars.comps),
        sh_float_type(vars.comps)
    );

// Running extrema used by the anti-ringing clamp
if (vars.use_ar) {
#line 1071
    pl_str_append_asprintf_c(alloc, buf,
        "%s hi = %s(0.0);\n"
        "%s lo = %s(1e9);\n",
        sh_float_type(vars.comps),
        sh_float_type(vars.comps),
        sh_float_type(vars.comps),
        sh_float_type(vars.comps)
    );

}
// Loop header; `%%` yields a literal `%` in the emitted GLSL
#line 1074
    pl_str_append_asprintf_c(alloc, buf,
        "#pragma unroll 4\n"
        "for (uint n = 0u; n < _%hx; n += uint(%u)) {\n"
        "if (n %% 4u == 0u)\n"
        "ws = _%hx(vec2(float(n / 4u) / float(%f), fcoord));\n"
        "off = float(n);\n",
        vars.n,
        vars.use_linear_2u_1u,
        vars.lut,
        vars.denom
    );

// This string goes through pl_str_append (no formatting), hence the bare `%`
if (vars.use_linear)
#line 1080
    pl_str_append(alloc, buf, pl_str0(
        "off += ws[n % 4u + 1u];\n"
    ));

#line 1081
    pl_str_append_asprintf_c(alloc, buf,
        "c = textureLod(_%hx, base + pt * off, 0.0).%s;\n",
        vars.src_tex,
        sh_swizzle(vars.comps)
    );

// Only the two central taps contribute to the clamp range
if (vars.use_ar) {
#line 1083
    pl_str_append_asprintf_c(alloc, buf,
        "if (n == _%hx / 2u - 1u || n == _%hx / 2u) {\n"
        "lo = min(lo, c);\n"
        "hi = max(hi, c);\n"
        "}\n",
        vars.n,
        vars.n
    );

}
#line 1088
    pl_str_append(alloc, buf, pl_str0(
        "ca += ws[n % 4u] * c;\n"
        "}\n"
    ));

if (vars.use_ar)
#line 1091
    pl_str_append_asprintf_c(alloc, buf,
        "ca = mix(ca, clamp(ca, lo, hi), _%hx);\n",
        vars.cfg_antiring
    );

#line 1092
    pl_str_append_asprintf_c(alloc, buf,
        "color.%s = _%hx * ca;\n"
        "}\n",
        sh_swizzle(vars.comps),
        vars.scale
    );


return sizeof(vars);
}
// GLSL template emitter for the distortion shader.
//
// The emitted block declares `vec4 color`, maps the output position through
// `tf * pos + tf_c`, and samples `tex` there: with four cubic-weighted
// textureLod() taps if `params_bicubic` is set, otherwise with a single
// texture() call. If `params_alpha_mode` is set, the result is faded out
// within one `pt` of the [0,1] texture border: all four channels when
// `params_alpha_mode_pl_alpha_premultiplied` is set, alpha only otherwise.
// `ptr` holds the packed arguments; the number of argument bytes consumed is
// returned.
size_t _glsl_1171_fn(void *alloc, pl_str *buf, const uint8_t *ptr);
size_t _glsl_1171_fn(void *alloc, pl_str *buf, const uint8_t *ptr)
{
// Layout must match `_glsl_1171_args` as serialized by pl_shader_distort
struct __attribute__((__packed__)) {
    ident_t tf;
    ident_t pos;
    ident_t tf_c;
    ident_t pt;
    ident_t tex;
    bool params_bicubic;
    bool params_alpha_mode;
    bool params_alpha_mode_pl_alpha_premultiplied;
} vars;
memcpy(&vars, ptr, sizeof(vars));

#line 1171
    pl_str_append_asprintf_c(alloc, buf,
        "\n"
        "vec4 color;\n"
        "{\n"
        "vec2 pos = _%hx * _%hx + _%hx;\n"
        "vec2 pt = _%hx;\n",
        vars.tf,
        vars.pos,
        vars.tf_c,
        vars.pt
    );

if (vars.params_bicubic) {
#line 1177
    pl_str_append_asprintf_c(alloc, buf,
        "vec2 size = vec2(textureSize(_%hx, 0));\n"
        "vec2 frac  = fract(pos * size + vec2(0.5));\n"
        "vec2 frac2 = frac * frac;\n"
        "vec2 inv   = vec2(1.0) - frac;\n"
        "vec2 inv2  = inv * inv;\n"
        "vec2 w0 = 1.0/6.0 * inv2 * inv;\n"
        "vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac);\n"
        "vec2 w2 = 2.0/3.0 - 0.5 * inv2  * (2.0 - inv);\n"
        "vec2 w3 = 1.0/6.0 * frac2 * frac;\n"
        "vec4 g = vec4(w0 + w1, w2 + w3);\n"
        "vec4 h = vec4(w1, w3) / g + inv.xyxy;\n"
        "h.xy -= vec2(2.0);\n"
        "vec4 p = pos.xyxy + pt.xyxy * h;\n"
        "vec4 c00 = textureLod(_%hx, p.xy, 0.0);\n"
        "vec4 c01 = textureLod(_%hx, p.xw, 0.0);\n"
        "vec4 c0 = mix(c01, c00, g.y);\n"
        "vec4 c10 = textureLod(_%hx, p.zy, 0.0);\n"
        "vec4 c11 = textureLod(_%hx, p.zw, 0.0);\n"
        "vec4 c1 = mix(c11, c10, g.y);\n"
        "color = mix(c1, c0, g.x);\n",
        vars.tex,
        vars.tex,
        vars.tex,
        vars.tex,
        vars.tex
    );

} else {
#line 1198
    pl_str_append_asprintf_c(alloc, buf,
        "color = texture(_%hx, pos);\n",
        vars.tex
    );

}
// Border fade; `params_alpha_mode` is true for any non-zero alpha mode
if (vars.params_alpha_mode) {
#line 1201
    pl_str_append(alloc, buf, pl_str0(
        "vec2 border = min(pos, vec2(1.0) - pos);\n"
        "border = smoothstep(vec2(0.0), pt, border);\n"
    ));

if (vars.params_alpha_mode_pl_alpha_premultiplied)
#line 1204
    pl_str_append(alloc, buf, pl_str0(
        "color.rgba *= border.x * border.y;\n"
    ));

else
#line 1206
    pl_str_append(alloc, buf, pl_str0(
        "color.a *= border.x * border.y;\n"
    ));

}
// Closes the `{` opened in the first fragment
#line 1208
    pl_str_append(alloc, buf, pl_str0(
        "}\n"
    ));


return sizeof(vars);
}