Remove in-place translations support.
[dcpomatic.git] / src / lib / image.cc
index 57c152f137a396b471ce71b8bbaa842ccd9dce86..2167918f8d1055c1c7fff274021797fb767b84bc 100644 (file)
@@ -1,5 +1,5 @@
 /*
-    Copyright (C) 2012-2016 Carl Hetherington <cth@carlh.net>
+    Copyright (C) 2012-2021 Carl Hetherington <cth@carlh.net>
 
     This file is part of DCP-o-matic.
 
 
 */
 
+
 /** @file src/lib/image.cc
  *  @brief A class to describe a video image.
  */
 
-#include "image.h"
-#include "exceptions.h"
-#include "timer.h"
-#include "rect.h"
-#include "util.h"
+
 #include "compose.hpp"
+#include "dcpomatic_assert.h"
 #include "dcpomatic_socket.h"
+#include "enum_indexed_vector.h"
+#include "exceptions.h"
+#include "image.h"
+#include "maths_util.h"
+#include "memory_util.h"
+#include "rect.h"
+#include "timer.h"
 #include <dcp/rgb_xyz.h>
 #include <dcp/transfer_function.h>
+#include <dcp/warnings.h>
+LIBDCP_DISABLE_WARNINGS
 extern "C" {
-#include <libswscale/swscale.h>
-#include <libavutil/pixfmt.h>
-#include <libavutil/pixdesc.h>
 #include <libavutil/frame.h>
+#include <libavutil/pixdesc.h>
+#include <libavutil/pixfmt.h>
+#include <libswscale/swscale.h>
 }
-#include <png.h>
+LIBDCP_ENABLE_WARNINGS
 #if HAVE_VALGRIND_MEMCHECK_H
 #include <valgrind/memcheck.h>
 #endif
 #include <iostream>
 
+
 #include "i18n.h"
 
-using std::string;
-using std::min;
-using std::max;
-using std::cout;
+
 using std::cerr;
+using std::cout;
 using std::list;
+using std::make_shared;
+using std::max;
+using std::min;
 using std::runtime_error;
-using boost::shared_ptr;
+using std::shared_ptr;
+using std::string;
 using dcp::Size;
 
+
+/** The memory alignment, in bytes, used for each row of an image if Alignment::PADDED is requested */
+int constexpr ALIGNMENT = 64;
+
+/* U/V black value for 8-bit colour: (1 << 7) - 1 = 127, one less than the 2^7 mid-point */
+static uint8_t const eight_bit_uv =    (1 << 7) - 1;
+/* U/V black value for 9-bit colour: (1 << 8) - 1 = 255, one less than the 2^8 mid-point */
+static uint16_t const nine_bit_uv =    (1 << 8) - 1;
+/* U/V black value for 10-bit colour: (1 << 9) - 1 = 511, one less than the 2^9 mid-point */
+static uint16_t const ten_bit_uv =     (1 << 9) - 1;
+/* U/V black value for 16-bit colour: (1 << 15) - 1 = 32767, one less than the 2^15 mid-point */
+static uint16_t const sixteen_bit_uv = (1 << 15) - 1;
+
+
 int
 Image::vertical_factor (int n) const
 {
@@ -62,12 +86,12 @@ Image::vertical_factor (int n) const
                return 1;
        }
 
-       AVPixFmtDescriptor const * d = av_pix_fmt_desc_get(_pixel_format);
+       auto d = av_pix_fmt_desc_get(_pixel_format);
        if (!d) {
                throw PixelFormatError ("line_factor()", _pixel_format);
        }
 
-       return pow (2.0f, d->log2_chroma_h);
+       return lrintf(powf(2.0f, d->log2_chroma_h));
 }
 
 int
@@ -77,14 +101,15 @@ Image::horizontal_factor (int n) const
                return 1;
        }
 
-       AVPixFmtDescriptor const * d = av_pix_fmt_desc_get(_pixel_format);
+       auto d = av_pix_fmt_desc_get(_pixel_format);
        if (!d) {
                throw PixelFormatError ("sample_size()", _pixel_format);
        }
 
-       return pow (2.0f, d->log2_chroma_w);
+       return lrintf(powf(2.0f, d->log2_chroma_w));
 }
 
+
 /** @param n Component index.
  *  @return Number of samples (i.e. pixels, unless sub-sampled) in each direction for this component.
  */
@@ -92,24 +117,25 @@ dcp::Size
 Image::sample_size (int n) const
 {
        return dcp::Size (
-               lrint (ceil (static_cast<double>(size().width) / horizontal_factor (n))),
-               lrint (ceil (static_cast<double>(size().height) / vertical_factor (n)))
+               lrint (ceil(static_cast<double>(size().width) / horizontal_factor(n))),
+               lrint (ceil(static_cast<double>(size().height) / vertical_factor(n)))
                );
 }
 
+
 /** @return Number of planes */
 int
 Image::planes () const
 {
-       AVPixFmtDescriptor const * d = av_pix_fmt_desc_get(_pixel_format);
-       if (!d) {
-               throw PixelFormatError ("planes()", _pixel_format);
-       }
-
        if (_pixel_format == AV_PIX_FMT_PAL8) {
                return 2;
        }
 
+       auto d = av_pix_fmt_desc_get(_pixel_format);
+       if (!d) {
+               throw PixelFormatError ("planes()", _pixel_format);
+       }
+
        if ((d->flags & AV_PIX_FMT_FLAG_PLANAR) == 0) {
                return 1;
        }
@@ -117,37 +143,90 @@ Image::planes () const
        return d->nb_components;
 }
 
+
+/** Round a horizontal pixel count or position so that it is a whole number of
+ *  horizontally-subsampled chroma samples.
+ *  @param p Value in pixels.
+ *  @param desc Pixel format descriptor; its log2_chroma_w gives the number of low bits to clear.
+ *  @return p with its lowest log2_chroma_w bits set to zero.
+ */
+static
+int
+round_width_for_subsampling (int p, AVPixFmtDescriptor const * desc)
+{
+       int const factor = 1 << desc->log2_chroma_w;
+       int const low_bits = factor - 1;
+       return p & ~low_bits;
+}
+
+
+/** Round a vertical pixel count or position so that it is a whole number of
+ *  vertically-subsampled chroma samples.
+ *  @param p Value in pixels.
+ *  @param desc Pixel format descriptor; its log2_chroma_h gives the number of low bits to clear.
+ *  @return p with its lowest log2_chroma_h bits set to zero.
+ */
+static
+int
+round_height_for_subsampling (int p, AVPixFmtDescriptor const * desc)
+{
+       int const factor = 1 << desc->log2_chroma_h;
+       int const low_bits = factor - 1;
+       return p & ~low_bits;
+}
+
+
 /** Crop this image, scale it to `inter_size' and then place it in a black frame of `out_size'.
  *  @param crop Amount to crop by.
  *  @param inter_size Size to scale the cropped image to.
  *  @param out_size Size of output frame; if this is larger than inter_size there will be black padding.
  *  @param yuv_to_rgb YUV to RGB transformation to use, if required.
+ *  @param video_range Video range of the image.
  *  @param out_format Output pixel format.
  *  @param out_alignment Alignment to use for the output image.
+ *  @param out_video_range Video range to use for the output image.
  *  @param fast Try to be fast at the possible expense of quality; at present this means using
  *  fast bilinear rather than bicubic scaling.
  */
 shared_ptr<Image>
 Image::crop_scale_window (
-       Crop crop, dcp::Size inter_size, dcp::Size out_size, dcp::YUVToRGB yuv_to_rgb, VideoRange video_range, AVPixelFormat out_format, bool out_aligned, bool fast
+       Crop crop,
+       dcp::Size inter_size,
+       dcp::Size out_size,
+       dcp::YUVToRGB yuv_to_rgb,
+       VideoRange video_range,
+       AVPixelFormat out_format,
+       VideoRange out_video_range,
+       Alignment out_alignment,
+       bool fast
        ) const
 {
        /* Empirical testing suggests that sws_scale() will crash if
-          the input image is not aligned.
+          the input image is not padded.
        */
-       DCPOMATIC_ASSERT (aligned ());
+       DCPOMATIC_ASSERT (alignment() == Alignment::PADDED);
 
        DCPOMATIC_ASSERT (out_size.width >= inter_size.width);
        DCPOMATIC_ASSERT (out_size.height >= inter_size.height);
 
-       shared_ptr<Image> out (new Image(out_format, out_size, out_aligned));
+       auto out = make_shared<Image>(out_format, out_size, out_alignment);
        out->make_black ();
 
+       auto in_desc = av_pix_fmt_desc_get (_pixel_format);
+       if (!in_desc) {
+               throw PixelFormatError ("crop_scale_window()", _pixel_format);
+       }
+
+       /* Round down so that we crop only the number of pixels that is straightforward
+        * considering any subsampling.
+        */
+       Crop corrected_crop(
+               round_width_for_subsampling(crop.left, in_desc),
+               round_width_for_subsampling(crop.right, in_desc),
+               round_height_for_subsampling(crop.top, in_desc),
+               round_height_for_subsampling(crop.bottom, in_desc)
+               );
+
+       /* Also check that we aren't cropping more image than there actually is */
+       if ((corrected_crop.left + corrected_crop.right) >= (size().width - 4)) {
+               corrected_crop.left = 0;
+               corrected_crop.right = size().width - 4;
+       }
+
+       if ((corrected_crop.top + corrected_crop.bottom) >= (size().height - 4)) {
+               corrected_crop.top = 0;
+               corrected_crop.bottom = size().height - 4;
+       }
+
        /* Size of the image after any crop */
-       dcp::Size const cropped_size = crop.apply (size ());
+       auto const cropped_size = corrected_crop.apply (size());
 
        /* Scale context for a scale from cropped_size to inter_size */
-       struct SwsContext* scale_context = sws_getContext (
+       auto scale_context = sws_getContext (
                        cropped_size.width, cropped_size.height, pixel_format(),
                        inter_size.width, inter_size.height, out_format,
                        fast ? SWS_FAST_BILINEAR : SWS_BICUBIC, 0, 0, 0
@@ -157,11 +236,11 @@ Image::crop_scale_window (
                throw runtime_error (N_("Could not allocate SwsContext"));
        }
 
-       DCPOMATIC_ASSERT (yuv_to_rgb < dcp::YUV_TO_RGB_COUNT);
-       int const lut[dcp::YUV_TO_RGB_COUNT] = {
-               SWS_CS_ITU601,
-               SWS_CS_ITU709
-       };
+       DCPOMATIC_ASSERT (yuv_to_rgb < dcp::YUVToRGB::COUNT);
+       EnumIndexedVector<int, dcp::YUVToRGB> lut;
+       lut[dcp::YUVToRGB::REC601] = SWS_CS_ITU601;
+       lut[dcp::YUVToRGB::REC709] = SWS_CS_ITU709;
+       lut[dcp::YUVToRGB::REC2020] = SWS_CS_BT2020;
 
        /* The 3rd parameter here is:
           0 -> source range MPEG (i.e. "video", 16-235)
@@ -171,45 +250,37 @@ Image::crop_scale_window (
           1 -> destination range JPEG (i.e. "full", 0-255)
 
           But remember: sws_setColorspaceDetails ignores these
-          parameters unless the corresponding image isYUV or isGray.
-          (If it's neither, it uses video range).
+          parameters unless both the source and destination images
+          are isYUV or isGray.  (If either is not, it uses video range).
        */
        sws_setColorspaceDetails (
                scale_context,
-               sws_getCoefficients (lut[yuv_to_rgb]), video_range == VIDEO_RANGE_VIDEO ? 0 : 1,
-               sws_getCoefficients (lut[yuv_to_rgb]), 1,
+               sws_getCoefficients(lut[yuv_to_rgb]), video_range == VideoRange::VIDEO ? 0 : 1,
+               sws_getCoefficients(lut[yuv_to_rgb]), out_video_range == VideoRange::VIDEO ? 0 : 1,
                0, 1 << 16, 1 << 16
                );
 
-       AVPixFmtDescriptor const * in_desc = av_pix_fmt_desc_get (_pixel_format);
-       if (!in_desc) {
-               throw PixelFormatError ("crop_scale_window()", _pixel_format);
-       }
-
        /* Prepare input data pointers with crop */
        uint8_t* scale_in_data[planes()];
        for (int c = 0; c < planes(); ++c) {
-               /* To work out the crop in bytes, start by multiplying
-                  the crop by the (average) bytes per pixel.  Then
-                  round down so that we don't crop a subsampled pixel until
-                  we've cropped all of its Y-channel pixels.
-               */
-               int const x = lrintf (bytes_per_pixel(c) * crop.left) & ~ ((int) in_desc->log2_chroma_w);
-               scale_in_data[c] = data()[c] + x + stride()[c] * (crop.top / vertical_factor(c));
+               int const x = lrintf(bytes_per_pixel(c) * corrected_crop.left);
+               scale_in_data[c] = data()[c] + x + stride()[c] * (corrected_crop.top / vertical_factor(c));
        }
 
-       /* Corner of the image within out_size */
-       Position<int> const corner ((out_size.width - inter_size.width) / 2, (out_size.height - inter_size.height) / 2);
-
-       AVPixFmtDescriptor const * out_desc = av_pix_fmt_desc_get (out_format);
+       auto out_desc = av_pix_fmt_desc_get (out_format);
        if (!out_desc) {
                throw PixelFormatError ("crop_scale_window()", out_format);
        }
 
+       /* Corner of the image within out_size */
+       Position<int> const corner (
+               round_width_for_subsampling((out_size.width - inter_size.width) / 2, out_desc),
+               round_height_for_subsampling((out_size.height - inter_size.height) / 2, out_desc)
+               );
+
        uint8_t* scale_out_data[out->planes()];
        for (int c = 0; c < out->planes(); ++c) {
-               /* See the note in the crop loop above */
-               int const x = lrintf (out->bytes_per_pixel(c) * corner.x) & ~ ((int) out_desc->log2_chroma_w);
+               int const x = lrintf(out->bytes_per_pixel(c) * corner.x);
                scale_out_data[c] = out->data()[c] + x + out->stride()[c] * (corner.y / out->vertical_factor(c));
        }
 
@@ -222,51 +293,71 @@ Image::crop_scale_window (
 
        sws_freeContext (scale_context);
 
-       if (crop != Crop() && cropped_size == inter_size && _pixel_format == out_format) {
-               /* We are cropping without any scaling or pixel format conversion, so FFmpeg may have left some
-                  data behind in our image.  Clear it out.  It may get to the point where we should just stop
-                  trying to be clever with cropping.
-               */
-               out->make_part_black (corner.x + cropped_size.width, out_size.width - cropped_size.width);
+       /* There are some cases where there will be unwanted image data left in the image at this point:
+        *
+        * 1. When we are cropping without any scaling or pixel format conversion.
+        * 2. When we are scaling to certain sizes and placing the result into a larger
+        *    black frame.
+        *
+        * Clear out the sides of the image to take care of those cases.
+        */
+       auto const pad = (out_size.width - inter_size.width) / 2;
+       out->make_part_black(0, pad);
+       out->make_part_black(corner.x + inter_size.width, pad);
+
+       if (
+               video_range == VideoRange::VIDEO &&
+               out_video_range == VideoRange::FULL &&
+               av_pix_fmt_desc_get(_pixel_format)->flags & AV_PIX_FMT_FLAG_RGB
+          ) {
+               /* libswscale will not convert video range for RGB sources, so we have to do it ourselves */
+               out->video_range_to_full_range ();
        }
 
        return out;
 }
 
+
 shared_ptr<Image>
-Image::convert_pixel_format (dcp::YUVToRGB yuv_to_rgb, AVPixelFormat out_format, bool out_aligned, bool fast) const
+Image::convert_pixel_format (dcp::YUVToRGB yuv_to_rgb, AVPixelFormat out_format, Alignment out_alignment, bool fast) const
 {
-       return scale(size(), yuv_to_rgb, out_format, out_aligned, fast);
+       return scale(size(), yuv_to_rgb, out_format, out_alignment, fast);
 }
 
+
 /** @param out_size Size to scale to.
  *  @param yuv_to_rgb YUVToRGB transform transform to use, if required.
  *  @param out_format Output pixel format.
- *  @param out_aligned true to make an aligned output image.
+ *  @param out_alignment Output alignment.
  *  @param fast Try to be fast at the possible expense of quality; at present this means using
  *  fast bilinear rather than bicubic scaling.
  */
 shared_ptr<Image>
-Image::scale (dcp::Size out_size, dcp::YUVToRGB yuv_to_rgb, AVPixelFormat out_format, bool out_aligned, bool fast) const
+Image::scale (dcp::Size out_size, dcp::YUVToRGB yuv_to_rgb, AVPixelFormat out_format, Alignment out_alignment, bool fast) const
 {
        /* Empirical testing suggests that sws_scale() will crash if
-          the input image is not aligned.
+          the input image alignment is not PADDED.
        */
-       DCPOMATIC_ASSERT (aligned ());
-
-       shared_ptr<Image> scaled (new Image (out_format, out_size, out_aligned));
-
-       struct SwsContext* scale_context = sws_getContext (
+       DCPOMATIC_ASSERT (alignment() == Alignment::PADDED);
+       DCPOMATIC_ASSERT(size().width > 0);
+       DCPOMATIC_ASSERT(size().height > 0);
+       DCPOMATIC_ASSERT(out_size.width > 0);
+       DCPOMATIC_ASSERT(out_size.height > 0);
+
+       auto scaled = make_shared<Image>(out_format, out_size, out_alignment);
+       auto scale_context = sws_getContext (
                size().width, size().height, pixel_format(),
                out_size.width, out_size.height, out_format,
                (fast ? SWS_FAST_BILINEAR : SWS_BICUBIC) | SWS_ACCURATE_RND, 0, 0, 0
                );
 
-       DCPOMATIC_ASSERT (yuv_to_rgb < dcp::YUV_TO_RGB_COUNT);
-       int const lut[dcp::YUV_TO_RGB_COUNT] = {
-               SWS_CS_ITU601,
-               SWS_CS_ITU709
-       };
+       DCPOMATIC_ASSERT(scale_context);
+
+       DCPOMATIC_ASSERT (yuv_to_rgb < dcp::YUVToRGB::COUNT);
+       EnumIndexedVector<int, dcp::YUVToRGB> lut;
+       lut[dcp::YUVToRGB::REC601] = SWS_CS_ITU601;
+       lut[dcp::YUVToRGB::REC709] = SWS_CS_ITU709;
+       lut[dcp::YUVToRGB::REC2020] = SWS_CS_BT2020;
 
        /* The 3rd parameter here is:
           0 -> source range MPEG (i.e. "video", 16-235)
@@ -281,8 +372,8 @@ Image::scale (dcp::Size out_size, dcp::YUVToRGB yuv_to_rgb, AVPixelFormat out_fo
        */
        sws_setColorspaceDetails (
                scale_context,
-               sws_getCoefficients (lut[yuv_to_rgb]), 0,
-               sws_getCoefficients (lut[yuv_to_rgb]), 0,
+               sws_getCoefficients(lut[yuv_to_rgb]), 0,
+               sws_getCoefficients(lut[yuv_to_rgb]), 0,
                0, 1 << 16, 1 << 16
                );
 
@@ -298,13 +389,14 @@ Image::scale (dcp::Size out_size, dcp::YUVToRGB yuv_to_rgb, AVPixelFormat out_fo
        return scaled;
 }
 
+
 /** Blacken a YUV image whose bits per pixel is rounded up to 16 */
 void
 Image::yuv_16_black (uint16_t v, bool alpha)
 {
        memset (data()[0], 0, sample_size(0).height * stride()[0]);
        for (int i = 1; i < 3; ++i) {
-               int16_t* p = reinterpret_cast<int16_t*> (data()[i]);
+               auto p = reinterpret_cast<int16_t*> (data()[i]);
                int const lines = sample_size(i).height;
                for (int y = 0; y < lines; ++y) {
                        /* We divide by 2 here because we are writing 2 bytes at a time */
@@ -320,15 +412,28 @@ Image::yuv_16_black (uint16_t v, bool alpha)
        }
 }
 
+
 uint16_t
 Image::swap_16 (uint16_t v)
 {
        return ((v >> 8) & 0xff) | ((v & 0xff) << 8);
 }
 
+
 void
-Image::make_part_black (int x, int w)
+Image::make_part_black (int const start, int const width)
 {
+       auto y_part = [&]() {
+               int const bpp = bytes_per_pixel(0);
+               int const h = sample_size(0).height;
+               int const s = stride()[0];
+               auto p = data()[0];
+               for (int y = 0; y < h; ++y) {
+                       memset (p + start * bpp, 0, width * bpp);
+                       p += s;
+               }
+       };
+
        switch (_pixel_format) {
        case AV_PIX_FMT_RGB24:
        case AV_PIX_FMT_ARGB:
@@ -345,29 +450,65 @@ Image::make_part_black (int x, int w)
                int const s = stride()[0];
                uint8_t* p = data()[0];
                for (int y = 0; y < h; y++) {
-                       memset (p + x * bpp, 0, w * bpp);
+                       memset (p + start * bpp, 0, width * bpp);
                        p += s;
                }
                break;
        }
-
+       case AV_PIX_FMT_YUV420P:
+       {
+               y_part ();
+               for (int i = 1; i < 3; ++i) {
+                       auto p = data()[i];
+                       int const h = sample_size(i).height;
+                       for (int y = 0; y < h; ++y) {
+                               for (int x = start / 2; x < (start + width) / 2; ++x) {
+                                       p[x] = eight_bit_uv;
+                               }
+                               p += stride()[i];
+                       }
+               }
+               break;
+       }
+       case AV_PIX_FMT_YUV422P10LE:
+       {
+               y_part ();
+               for (int i = 1; i < 3; ++i) {
+                       auto p = reinterpret_cast<int16_t*>(data()[i]);
+                       int const h = sample_size(i).height;
+                       for (int y = 0; y < h; ++y) {
+                               for (int x = start / 2; x < (start + width) / 2; ++x) {
+                                       p[x] = ten_bit_uv;
+                               }
+                               p += stride()[i] / 2;
+                       }
+               }
+               break;
+       }
+       case AV_PIX_FMT_YUV444P10LE:
+       {
+               y_part();
+               for (int i = 1; i < 3; ++i) {
+                       auto p = reinterpret_cast<int16_t*>(data()[i]);
+                       int const h = sample_size(i).height;
+                       for (int y = 0; y < h; ++y) {
+                               for (int x = start; x < (start + width); ++x) {
+                                       p[x] = ten_bit_uv;
+                               }
+                               p += stride()[i] / 2;
+                       }
+               }
+               break;
+       }
        default:
                throw PixelFormatError ("make_part_black()", _pixel_format);
        }
 }
 
+
 void
 Image::make_black ()
 {
-       /* U/V black value for 8-bit colour */
-       static uint8_t const eight_bit_uv =     (1 << 7) - 1;
-       /* U/V black value for 9-bit colour */
-       static uint16_t const nine_bit_uv =     (1 << 8) - 1;
-       /* U/V black value for 10-bit colour */
-       static uint16_t const ten_bit_uv =      (1 << 9) - 1;
-       /* U/V black value for 16-bit colour */
-       static uint16_t const sixteen_bit_uv =  (1 << 15) - 1;
-
        switch (_pixel_format) {
        case AV_PIX_FMT_YUV420P:
        case AV_PIX_FMT_YUV422P:
@@ -480,26 +621,345 @@ Image::make_black ()
        }
 }
 
+
 void
 Image::make_transparent ()
 {
-       if (_pixel_format != AV_PIX_FMT_BGRA && _pixel_format != AV_PIX_FMT_RGBA) {
+       if (_pixel_format != AV_PIX_FMT_BGRA && _pixel_format != AV_PIX_FMT_RGBA && _pixel_format != AV_PIX_FMT_RGBA64BE) {
                throw PixelFormatError ("make_transparent()", _pixel_format);
        }
 
        memset (data()[0], 0, sample_size(0).height * stride()[0]);
 }
 
+
+/** Parameters of the target image (the one being blended onto) */
+struct TargetParams
+{
+       /** x position in the target at which the blend loops start */
+       int start_x;
+       /** y position (line) in the target at which the blend loops start */
+       int start_y;
+       /** Size of the target; the blend loops stop on reaching size.width or size.height */
+       dcp::Size size;
+       /** Per-plane data pointers; index 0 is the only plane used by line_pointer(),
+        *  and the YUV blenders also read indices 1 and 2 for U and V.
+        */
+       uint8_t* const* data;
+       /** Per-plane stride, used as the distance in bytes from one line to the next */
+       int const* stride;
+       /** Bytes per pixel in plane 0, as used by line_pointer() */
+       int bpp;
+
+       /** @param y Line index.
+        *  @return Pointer into plane 0 at line y, moved along by start_x pixels.
+        */
+       uint8_t* line_pointer(int y) const {
+               return data[0] + y * stride[0] + start_x * bpp;
+       }
+};
+
+
+/** Parameters of the other image (the one being blended onto the target) when target and other are RGB */
+struct OtherRGBParams
+{
+       /** x position in the other image from which the blend loops start counting */
+       int start_x;
+       /** y position (line) in the other image at which the blend loops start */
+       int start_y;
+       /** Size of the other image; the blend loops stop on reaching size.width or size.height */
+       dcp::Size size;
+       /** Per-plane data pointers; only index 0 is read by line_pointer() */
+       uint8_t* const* data;
+       /** Per-plane stride, used as the distance in bytes from one line to the next */
+       int const* stride;
+       /** Bytes per pixel; the RGB blenders step through each line by this many bytes per pixel */
+       int bpp;
+
+       /** @param y Line index.
+        *  @return Pointer to the start of line y in plane 0.
+        *  NOTE(review): unlike TargetParams::line_pointer() this adds no start_x offset,
+        *  although the blend loops begin counting ox from start_x; confirm that this is intended.
+        */
+       uint8_t* line_pointer(int y) const {
+               return data[0] + y * stride[0];
+       }
+
+       /** @return 2^(bpp * 2) - 1, i.e. 255 when bpp is 4 and 65535 when bpp is 8.  The RGB blenders
+        *  divide the raw alpha sample by this; presumably this scales alpha to the range 0 to 1
+        *  for pixels made of four equal-width components.
+        */
+       float alpha_divisor() const {
+               return pow(2, bpp * 2) - 1;
+       }
+};
+
+
+/** Parameters of the other image (the one being blended onto the target) when target and other are YUV */
+struct OtherYUVParams
+{
+       /** x position in the other image at which the blend loops start */
+       int start_x;
+       /** y position (line) in the other image at which the blend loops start */
+       int start_y;
+       /** Size of the other image; the blend loops stop on reaching size.width or size.height */
+       dcp::Size size;
+       /** Per-plane data pointers; the YUV blenders read index 0 as Y, 1 as U and 2 as V */
+       uint8_t* const* data;
+       /** Per-plane stride, used as the distance in bytes from one line to the next */
+       int const* stride;
+
+       /** Data pointers of the image that supplies alpha; only index 0 is read by the YUV blenders */
+       uint8_t* const* alpha_data;
+       /** Stride of the alpha image, used as the distance in bytes from one line to the next */
+       int const* alpha_stride;
+       /** Bytes per pixel of the alpha image; the alpha pointer advances by this for each pixel */
+       int alpha_bpp;
+};
+
+
+template <class OtherType>
+void
+alpha_blend_onto_rgb24(TargetParams const& target, OtherRGBParams const& other, int red, int blue, std::function<float (OtherType*)> get, int value_divisor)
+{
+       /* Going onto RGB24.  First byte is red, second green, third blue */
+       auto const alpha_divisor = other.alpha_divisor();
+       for (int ty = target.start_y, oy = other.start_y; ty < target.size.height && oy < other.size.height; ++ty, ++oy) {
+               auto tp = target.line_pointer(ty);
+               auto op = reinterpret_cast<OtherType*>(other.line_pointer(oy));
+               for (int tx = target.start_x, ox = other.start_x; tx < target.size.width && ox < other.size.width; ++tx, ++ox) {
+                       float const alpha = get(op + 3) / alpha_divisor;
+                       tp[0] = (get(op + red) / value_divisor) * alpha + tp[0] * (1 - alpha);
+                       tp[1] = (get(op + 1) / value_divisor) * alpha + tp[1] * (1 - alpha);
+                       tp[2] = (get(op + blue) / value_divisor) * alpha + tp[2] * (1 - alpha);
+
+                       tp += target.bpp;
+                       op += other.bpp / sizeof(OtherType);
+               }
+       }
+}
+
+
+template <class OtherType>
+void
+alpha_blend_onto_bgra(TargetParams const& target, OtherRGBParams const& other, int red, int blue, std::function<float (OtherType*)> get, int value_divisor)
+{
+       auto const alpha_divisor = other.alpha_divisor();
+       for (int ty = target.start_y, oy = other.start_y; ty < target.size.height && oy < other.size.height; ++ty, ++oy) {
+               auto tp = target.line_pointer(ty);
+               auto op = reinterpret_cast<OtherType*>(other.line_pointer(oy));
+               for (int tx = target.start_x, ox = other.start_x; tx < target.size.width && ox < other.size.width; ++tx, ++ox) {
+                       float const alpha = get(op + 3) / alpha_divisor;
+                       tp[0] = (get(op + blue) / value_divisor) * alpha + tp[0] * (1 - alpha);
+                       tp[1] = (get(op + 1) / value_divisor) * alpha + tp[1] * (1 - alpha);
+                       tp[2] = (get(op + red) / value_divisor) * alpha + tp[2] * (1 - alpha);
+                       tp[3] = (get(op + 3) / value_divisor) * alpha + tp[3] * (1 - alpha);
+
+                       tp += target.bpp;
+                       op += other.bpp / sizeof(OtherType);
+               }
+       }
+}
+
+
+template <class OtherType>
+void
+alpha_blend_onto_rgba(TargetParams const& target, OtherRGBParams const& other, int red, int blue, std::function<float (OtherType*)> get, int value_divisor)
+{
+       auto const alpha_divisor = other.alpha_divisor();
+       for (int ty = target.start_y, oy = other.start_y; ty < target.size.height && oy < other.size.height; ++ty, ++oy) {
+               auto tp = target.line_pointer(ty);
+               auto op = reinterpret_cast<OtherType*>(other.line_pointer(oy));
+               for (int tx = target.start_x, ox = other.start_x; tx < target.size.width && ox < other.size.width; ++tx, ++ox) {
+                       float const alpha = get(op + 3) / alpha_divisor;
+                       tp[0] = (get(op + red) / value_divisor) * alpha + tp[0] * (1 - alpha);
+                       tp[1] = (get(op + 1) / value_divisor) * alpha + tp[1] * (1 - alpha);
+                       tp[2] = (get(op + blue) / value_divisor) * alpha + tp[2] * (1 - alpha);
+                       tp[3] = (get(op + 3) / value_divisor) * alpha + tp[3] * (1 - alpha);
+
+                       tp += target.bpp;
+                       op += other.bpp / sizeof(OtherType);
+               }
+       }
+}
+
+
+template <class OtherType>
+void
+alpha_blend_onto_rgb48le(TargetParams const& target, OtherRGBParams const& other, int red, int blue, std::function<float (OtherType*)> get, int value_scale)
+{
+       auto const alpha_divisor = other.alpha_divisor();
+       for (int ty = target.start_y, oy = other.start_y; ty < target.size.height && oy < other.size.height; ++ty, ++oy) {
+               auto tp = reinterpret_cast<uint16_t*>(target.line_pointer(ty));
+               auto op = reinterpret_cast<OtherType*>(other.line_pointer(oy));
+               for (int tx = target.start_x, ox = other.start_x; tx < target.size.width && ox < other.size.width; ++tx, ++ox) {
+                       float const alpha = get(op + 3) / alpha_divisor;
+                       tp[0] = get(op + red) * value_scale * alpha + tp[0] * (1 - alpha);
+                       tp[1] = get(op + 1) * value_scale * alpha + tp[1] * (1 - alpha);
+                       tp[2] = get(op + blue) * value_scale * alpha + tp[2] * (1 - alpha);
+
+                       tp += target.bpp / 2;
+                       op += other.bpp / sizeof(OtherType);
+               }
+       }
+}
+
+
+template <class OtherType>
+void
+alpha_blend_onto_xyz12le(TargetParams const& target, OtherRGBParams const& other, int red, int blue, std::function<float (OtherType*)> get, int value_divisor)
+{
+       auto const alpha_divisor = other.alpha_divisor();
+       auto conv = dcp::ColourConversion::srgb_to_xyz();
+       double fast_matrix[9];
+       dcp::combined_rgb_to_xyz(conv, fast_matrix);
+       auto lut_in = conv.in()->double_lut(0, 1, 8, false);
+       auto lut_out = conv.out()->int_lut(0, 1, 16, true, 65535);
+       for (int ty = target.start_y, oy = other.start_y; ty < target.size.height && oy < other.size.height; ++ty, ++oy) {
+               auto tp = reinterpret_cast<uint16_t*>(target.data[0] + ty * target.stride[0] + target.start_x * target.bpp);
+               auto op = reinterpret_cast<OtherType*>(other.data[0] + oy * other.stride[0]);
+               for (int tx = target.start_x, ox = other.start_x; tx < target.size.width && ox < other.size.width; ++tx, ++ox) {
+                       float const alpha = get(op + 3) / alpha_divisor;
+
+                       /* Convert sRGB to XYZ; op is BGRA.  First, input gamma LUT */
+                       double const r = lut_in[get(op + red) / value_divisor];
+                       double const g = lut_in[get(op + 1) / value_divisor];
+                       double const b = lut_in[get(op + blue) / value_divisor];
+
+                       /* RGB to XYZ, including Bradford transform and DCI companding */
+                       double const x = max(0.0, min(1.0, r * fast_matrix[0] + g * fast_matrix[1] + b * fast_matrix[2]));
+                       double const y = max(0.0, min(1.0, r * fast_matrix[3] + g * fast_matrix[4] + b * fast_matrix[5]));
+                       double const z = max(0.0, min(1.0, r * fast_matrix[6] + g * fast_matrix[7] + b * fast_matrix[8]));
+
+                       /* Out gamma LUT and blend */
+                       tp[0] = lut_out[lrint(x * 65535)] * alpha + tp[0] * (1 - alpha);
+                       tp[1] = lut_out[lrint(y * 65535)] * alpha + tp[1] * (1 - alpha);
+                       tp[2] = lut_out[lrint(z * 65535)] * alpha + tp[2] * (1 - alpha);
+
+                       tp += target.bpp / 2;
+                       op += other.bpp / sizeof(OtherType);
+               }
+       }
+}
+
+
+static
+void
+alpha_blend_onto_yuv420p(TargetParams const& target, OtherYUVParams const& other, std::function<float (uint8_t* data)> get_alpha)
+{
+       auto const ts = target.size;
+       auto const os = other.size;
+       for (int ty = target.start_y, oy = other.start_y; ty < ts.height && oy < os.height; ++ty, ++oy) {
+               int const hty = ty / 2;
+               int const hoy = oy / 2;
+               uint8_t* tY = target.data[0] + (ty * target.stride[0]) + target.start_x;
+               uint8_t* tU = target.data[1] + (hty * target.stride[1]) + target.start_x / 2;
+               uint8_t* tV = target.data[2] + (hty * target.stride[2]) + target.start_x / 2;
+               uint8_t* oY = other.data[0] + (oy * other.stride[0]) + other.start_x;
+               uint8_t* oU = other.data[1] + (hoy * other.stride[1]) + other.start_x / 2;
+               uint8_t* oV = other.data[2] + (hoy * other.stride[2]) + other.start_x / 2;
+               uint8_t* alpha = other.alpha_data[0] + (oy * other.alpha_stride[0]) + other.start_x * other.alpha_bpp;
+               for (int tx = target.start_x, ox = other.start_x; tx < ts.width && ox < os.width; ++tx, ++ox) {
+                       float const a = get_alpha(alpha);
+                       *tY = *oY * a + *tY * (1 - a);
+                       *tU = *oU * a + *tU * (1 - a);
+                       *tV = *oV * a + *tV * (1 - a);
+                       ++tY;
+                       ++oY;
+                       if (tx % 2) {
+                               ++tU;
+                               ++tV;
+                       }
+                       if (ox % 2) {
+                               ++oU;
+                               ++oV;
+                       }
+                       alpha += other.alpha_bpp;
+               }
+       }
+}
+
+
+static
+void
+alpha_blend_onto_yuv420p10(TargetParams const& target, OtherYUVParams const& other, std::function<float (uint8_t* data)> get_alpha)
+{
+       auto const ts = target.size;
+       auto const os = other.size;
+       for (int ty = target.start_y, oy = other.start_y; ty < ts.height && oy < os.height; ++ty, ++oy) {
+               int const hty = ty / 2;
+               int const hoy = oy / 2;
+               uint16_t* tY = reinterpret_cast<uint16_t*>(target.data[0] + (ty * target.stride[0])) + target.start_x;
+               uint16_t* tU = reinterpret_cast<uint16_t*>(target.data[1] + (hty * target.stride[1])) + target.start_x / 2;
+               uint16_t* tV = reinterpret_cast<uint16_t*>(target.data[2] + (hty * target.stride[2])) + target.start_x / 2;
+               uint16_t* oY = reinterpret_cast<uint16_t*>(other.data[0] + (oy * other.stride[0])) + other.start_x;
+               uint16_t* oU = reinterpret_cast<uint16_t*>(other.data[1] + (hoy * other.stride[1])) + other.start_x / 2;
+               uint16_t* oV = reinterpret_cast<uint16_t*>(other.data[2] + (hoy * other.stride[2])) + other.start_x / 2;
+               uint8_t* alpha = other.alpha_data[0] + (oy * other.alpha_stride[0]) + other.start_x * other.alpha_bpp;
+               for (int tx = target.start_x, ox = other.start_x; tx < ts.width && ox < os.width; ++tx, ++ox) {
+                       float const a = get_alpha(alpha);
+                       *tY = *oY * a + *tY * (1 - a);
+                       *tU = *oU * a + *tU * (1 - a);
+                       *tV = *oV * a + *tV * (1 - a);
+                       ++tY;
+                       ++oY;
+                       if (tx % 2) {
+                               ++tU;
+                               ++tV;
+                       }
+                       if (ox % 2) {
+                               ++oU;
+                               ++oV;
+                       }
+                       alpha += other.alpha_bpp;
+               }
+       }
+}
+
+
+static
+void
+alpha_blend_onto_yuv422p9or10le(TargetParams const& target, OtherYUVParams const& other, std::function<float (uint8_t* data)> get_alpha)
+{
+       auto const ts = target.size;
+       auto const os = other.size;
+       for (int ty = target.start_y, oy = other.start_y; ty < ts.height && oy < os.height; ++ty, ++oy) {
+               uint16_t* tY = reinterpret_cast<uint16_t*>(target.data[0] + (ty * target.stride[0])) + target.start_x;
+               uint16_t* tU = reinterpret_cast<uint16_t*>(target.data[1] + (ty * target.stride[1])) + target.start_x / 2;
+               uint16_t* tV = reinterpret_cast<uint16_t*>(target.data[2] + (ty * target.stride[2])) + target.start_x / 2;
+               uint16_t* oY = reinterpret_cast<uint16_t*>(other.data[0] + (oy * other.stride[0])) + other.start_x;
+               uint16_t* oU = reinterpret_cast<uint16_t*>(other.data[1] + (oy * other.stride[1])) + other.start_x / 2;
+               uint16_t* oV = reinterpret_cast<uint16_t*>(other.data[2] + (oy * other.stride[2])) + other.start_x / 2;
+               uint8_t* alpha = other.alpha_data[0] + (oy * other.alpha_stride[0]) + other.start_x * other.alpha_bpp;
+               for (int tx = target.start_x, ox = other.start_x; tx < ts.width && ox < os.width; ++tx, ++ox) {
+                       float const a = get_alpha(alpha);
+                       *tY = *oY * a + *tY * (1 - a);
+                       *tU = *oU * a + *tU * (1 - a);
+                       *tV = *oV * a + *tV * (1 - a);
+                       ++tY;
+                       ++oY;
+                       if (tx % 2) {
+                               ++tU;
+                               ++tV;
+                       }
+                       if (ox % 2) {
+                               ++oU;
+                               ++oV;
+                       }
+                       alpha += other.alpha_bpp;
+               }
+       }
+}
+
+
+static
+void
+alpha_blend_onto_yuv444p9or10le(TargetParams const& target, OtherYUVParams const& other, std::function<float (uint8_t* data)> get_alpha)
+{
+       auto const ts = target.size;
+       auto const os = other.size;
+       for (int ty = target.start_y, oy = other.start_y; ty < ts.height && oy < os.height; ++ty, ++oy) {
+               uint16_t* tY = reinterpret_cast<uint16_t*>(target.data[0] + (ty * target.stride[0])) + target.start_x;
+               uint16_t* tU = reinterpret_cast<uint16_t*>(target.data[1] + (ty * target.stride[1])) + target.start_x;
+               uint16_t* tV = reinterpret_cast<uint16_t*>(target.data[2] + (ty * target.stride[2])) + target.start_x;
+               uint16_t* oY = reinterpret_cast<uint16_t*>(other.data[0] + (oy * other.stride[0])) + other.start_x;
+               uint16_t* oU = reinterpret_cast<uint16_t*>(other.data[1] + (oy * other.stride[1])) + other.start_x;
+               uint16_t* oV = reinterpret_cast<uint16_t*>(other.data[2] + (oy * other.stride[2])) + other.start_x;
+               uint8_t* alpha = other.alpha_data[0] + (oy * other.alpha_stride[0]) + other.start_x * other.alpha_bpp;
+               for (int tx = target.start_x, ox = other.start_x; tx < ts.width && ox < os.width; ++tx, ++ox) {
+                       float const a = get_alpha(alpha);
+                       *tY = *oY * a + *tY * (1 - a);
+                       *tU = *oU * a + *tU * (1 - a);
+                       *tV = *oV * a + *tV * (1 - a);
+                       ++tY;
+                       ++oY;
+                       ++tU;
+                       ++tV;
+                       ++oU;
+                       ++oV;
+                       alpha += other.alpha_bpp;
+               }
+       }
+}
+
+
 void
 Image::alpha_blend (shared_ptr<const Image> other, Position<int> position)
 {
-       /* We're blending RGBA or BGRA images */
-       DCPOMATIC_ASSERT (other->pixel_format() == AV_PIX_FMT_BGRA || other->pixel_format() == AV_PIX_FMT_RGBA);
+       DCPOMATIC_ASSERT(
+               other->pixel_format() == AV_PIX_FMT_BGRA ||
+               other->pixel_format() == AV_PIX_FMT_RGBA ||
+               other->pixel_format() == AV_PIX_FMT_RGBA64BE
+               );
+
        int const blue = other->pixel_format() == AV_PIX_FMT_BGRA ? 0 : 2;
        int const red = other->pixel_format() == AV_PIX_FMT_BGRA ? 2 : 0;
 
-       int const other_bpp = 4;
-
        int start_tx = position.x;
        int start_ox = 0;
 
@@ -516,218 +976,147 @@ Image::alpha_blend (shared_ptr<const Image> other, Position<int> position)
                start_ty = 0;
        }
 
+       TargetParams target_params = {
+               start_tx,
+               start_ty,
+               size(),
+               data(),
+               stride(),
+               0
+       };
+
+       OtherRGBParams other_rgb_params = {
+               start_ox,
+               start_oy,
+               other->size(),
+               other->data(),
+               other->stride(),
+               other->pixel_format() == AV_PIX_FMT_RGBA64BE ? 8 : 4
+       };
+
+       OtherYUVParams other_yuv_params = {
+               start_ox,
+               start_oy,
+               other->size(),
+               other->data(),
+               other->stride(),
+               nullptr,
+               nullptr,
+               other->pixel_format() == AV_PIX_FMT_RGBA64BE ? 8 : 4
+       };
+
+       auto byteswap = [](uint16_t* p) {
+               return (*p >> 8) | ((*p & 0xff) << 8);
+       };
+
+       auto pass = [](uint8_t* p) {
+               return *p;
+       };
+
+       auto get_alpha_64be = [](uint8_t* p) {
+               return ((static_cast<int16_t>(p[6]) << 8) | p[7]) / 65535.0f;
+       };
+
+       auto get_alpha_byte = [](uint8_t* p) {
+               return p[3] / 255.0f;
+       };
+
        switch (_pixel_format) {
        case AV_PIX_FMT_RGB24:
-       {
-               /* Going onto RGB24.  First byte is red, second green, third blue */
-               int const this_bpp = 3;
-               for (int ty = start_ty, oy = start_oy; ty < size().height && oy < other->size().height; ++ty, ++oy) {
-                       uint8_t* tp = data()[0] + ty * stride()[0] + start_tx * this_bpp;
-                       uint8_t* op = other->data()[0] + oy * other->stride()[0];
-                       for (int tx = start_tx, ox = start_ox; tx < size().width && ox < other->size().width; ++tx, ++ox) {
-                               float const alpha = float (op[3]) / 255;
-                               tp[0] = op[red] * alpha + tp[0] * (1 - alpha);
-                               tp[1] = op[1] * alpha + tp[1] * (1 - alpha);
-                               tp[2] = op[blue] * alpha + tp[2] * (1 - alpha);
-
-                               tp += this_bpp;
-                               op += other_bpp;
-                       }
+               target_params.bpp = 3;
+               if (other->pixel_format() == AV_PIX_FMT_RGBA64BE) {
+                       alpha_blend_onto_rgb24<uint16_t>(target_params, other_rgb_params, red, blue, byteswap, 256);
+               } else {
+                       alpha_blend_onto_rgb24<uint8_t>(target_params, other_rgb_params, red, blue, pass, 1);
                }
                break;
-       }
        case AV_PIX_FMT_BGRA:
-       {
-               int const this_bpp = 4;
-               for (int ty = start_ty, oy = start_oy; ty < size().height && oy < other->size().height; ++ty, ++oy) {
-                       uint8_t* tp = data()[0] + ty * stride()[0] + start_tx * this_bpp;
-                       uint8_t* op = other->data()[0] + oy * other->stride()[0];
-                       for (int tx = start_tx, ox = start_ox; tx < size().width && ox < other->size().width; ++tx, ++ox) {
-                               float const alpha = float (op[3]) / 255;
-                               tp[0] = op[blue] * alpha + tp[0] * (1 - alpha);
-                               tp[1] = op[1] * alpha + tp[1] * (1 - alpha);
-                               tp[2] = op[red] * alpha + tp[2] * (1 - alpha);
-                               tp[3] = op[3] * alpha + tp[3] * (1 - alpha);
-
-                               tp += this_bpp;
-                               op += other_bpp;
-                       }
+               target_params.bpp = 4;
+               if (other->pixel_format() == AV_PIX_FMT_RGBA64BE) {
+                       alpha_blend_onto_bgra<uint16_t>(target_params, other_rgb_params, red, blue, byteswap, 256);
+               } else {
+                       alpha_blend_onto_bgra<uint8_t>(target_params, other_rgb_params, red, blue, pass, 1);
                }
                break;
-       }
        case AV_PIX_FMT_RGBA:
-       {
-               int const this_bpp = 4;
-               for (int ty = start_ty, oy = start_oy; ty < size().height && oy < other->size().height; ++ty, ++oy) {
-                       uint8_t* tp = data()[0] + ty * stride()[0] + start_tx * this_bpp;
-                       uint8_t* op = other->data()[0] + oy * other->stride()[0];
-                       for (int tx = start_tx, ox = start_ox; tx < size().width && ox < other->size().width; ++tx, ++ox) {
-                               float const alpha = float (op[3]) / 255;
-                               tp[0] = op[red] * alpha + tp[0] * (1 - alpha);
-                               tp[1] = op[1] * alpha + tp[1] * (1 - alpha);
-                               tp[2] = op[blue] * alpha + tp[2] * (1 - alpha);
-                               tp[3] = op[3] * alpha + tp[3] * (1 - alpha);
-
-                               tp += this_bpp;
-                               op += other_bpp;
-                       }
+               target_params.bpp = 4;
+               if (other->pixel_format() == AV_PIX_FMT_RGBA64BE) {
+                       alpha_blend_onto_rgba<uint16_t>(target_params, other_rgb_params, red, blue, byteswap, 256);
+               } else {
+                       alpha_blend_onto_rgba<uint8_t>(target_params, other_rgb_params, red, blue, pass, 1);
                }
                break;
-       }
        case AV_PIX_FMT_RGB48LE:
-       {
-               int const this_bpp = 6;
-               for (int ty = start_ty, oy = start_oy; ty < size().height && oy < other->size().height; ++ty, ++oy) {
-                       uint8_t* tp = data()[0] + ty * stride()[0] + start_tx * this_bpp;
-                       uint8_t* op = other->data()[0] + oy * other->stride()[0];
-                       for (int tx = start_tx, ox = start_ox; tx < size().width && ox < other->size().width; ++tx, ++ox) {
-                               float const alpha = float (op[3]) / 255;
-                               /* Blend high bytes */
-                               tp[1] = op[red] * alpha + tp[1] * (1 - alpha);
-                               tp[3] = op[1] * alpha + tp[3] * (1 - alpha);
-                               tp[5] = op[blue] * alpha + tp[5] * (1 - alpha);
-
-                               tp += this_bpp;
-                               op += other_bpp;
-                       }
+               target_params.bpp = 6;
+               if (other->pixel_format() == AV_PIX_FMT_RGBA64BE) {
+                       alpha_blend_onto_rgb48le<uint16_t>(target_params, other_rgb_params, red, blue, byteswap, 1);
+               } else {
+                       alpha_blend_onto_rgb48le<uint8_t>(target_params, other_rgb_params, red, blue, pass, 256);
                }
                break;
-       }
        case AV_PIX_FMT_XYZ12LE:
-       {
-               dcp::ColourConversion conv = dcp::ColourConversion::srgb_to_xyz();
-               double fast_matrix[9];
-               dcp::combined_rgb_to_xyz (conv, fast_matrix);
-               double const * lut_in = conv.in()->lut (8, false);
-               double const * lut_out = conv.out()->lut (16, true);
-               int const this_bpp = 6;
-               for (int ty = start_ty, oy = start_oy; ty < size().height && oy < other->size().height; ++ty, ++oy) {
-                       uint16_t* tp = reinterpret_cast<uint16_t*> (data()[0] + ty * stride()[0] + start_tx * this_bpp);
-                       uint8_t* op = other->data()[0] + oy * other->stride()[0];
-                       for (int tx = start_tx, ox = start_ox; tx < size().width && ox < other->size().width; ++tx, ++ox) {
-                               float const alpha = float (op[3]) / 255;
-
-                               /* Convert sRGB to XYZ; op is BGRA.  First, input gamma LUT */
-                               double const r = lut_in[op[red]];
-                               double const g = lut_in[op[1]];
-                               double const b = lut_in[op[blue]];
-
-                               /* RGB to XYZ, including Bradford transform and DCI companding */
-                               double const x = max (0.0, min (65535.0, r * fast_matrix[0] + g * fast_matrix[1] + b * fast_matrix[2]));
-                               double const y = max (0.0, min (65535.0, r * fast_matrix[3] + g * fast_matrix[4] + b * fast_matrix[5]));
-                               double const z = max (0.0, min (65535.0, r * fast_matrix[6] + g * fast_matrix[7] + b * fast_matrix[8]));
-
-                               /* Out gamma LUT and blend */
-                               tp[0] = lrint(lut_out[lrint(x)] * 65535) * alpha + tp[0] * (1 - alpha);
-                               tp[1] = lrint(lut_out[lrint(y)] * 65535) * alpha + tp[1] * (1 - alpha);
-                               tp[2] = lrint(lut_out[lrint(z)] * 65535) * alpha + tp[2] * (1 - alpha);
-
-                               tp += this_bpp / 2;
-                               op += other_bpp;
-                       }
+               target_params.bpp = 6;
+               if (other->pixel_format() == AV_PIX_FMT_RGBA64BE) {
+                       alpha_blend_onto_xyz12le<uint16_t>(target_params, other_rgb_params, red, blue, byteswap, 256);
+               } else {
+                       alpha_blend_onto_xyz12le<uint8_t>(target_params, other_rgb_params, red, blue, pass, 1);
                }
                break;
-       }
        case AV_PIX_FMT_YUV420P:
        {
-               shared_ptr<Image> yuv = other->convert_pixel_format (dcp::YUV_TO_RGB_REC709, _pixel_format, false, false);
-               dcp::Size const ts = size();
-               dcp::Size const os = yuv->size();
-               for (int ty = start_ty, oy = start_oy; ty < ts.height && oy < os.height; ++ty, ++oy) {
-                       int const hty = ty / 2;
-                       int const hoy = oy / 2;
-                       uint8_t* tY = data()[0] + (ty * stride()[0]) + start_tx;
-                       uint8_t* tU = data()[1] + (hty * stride()[1]) + start_tx / 2;
-                       uint8_t* tV = data()[2] + (hty * stride()[2]) + start_tx / 2;
-                       uint8_t* oY = yuv->data()[0] + (oy * yuv->stride()[0]) + start_ox;
-                       uint8_t* oU = yuv->data()[1] + (hoy * yuv->stride()[1]) + start_ox / 2;
-                       uint8_t* oV = yuv->data()[2] + (hoy * yuv->stride()[2]) + start_ox / 2;
-                       uint8_t* alpha = other->data()[0] + (oy * other->stride()[0]) + start_ox * 4;
-                       for (int tx = start_tx, ox = start_ox; tx < ts.width && ox < os.width; ++tx, ++ox) {
-                               float const a = float(alpha[3]) / 255;
-                               *tY = *oY * a + *tY * (1 - a);
-                               *tU = *oU * a + *tU * (1 - a);
-                               *tV = *oV * a + *tV * (1 - a);
-                               ++tY;
-                               ++oY;
-                               if (tx % 2) {
-                                       ++tU;
-                                       ++tV;
-                               }
-                               if (ox % 2) {
-                                       ++oU;
-                                       ++oV;
-                               }
-                               alpha += 4;
-                       }
+               auto yuv = other->convert_pixel_format (dcp::YUVToRGB::REC709, _pixel_format, Alignment::COMPACT, false);
+               other_yuv_params.data = yuv->data();
+               other_yuv_params.stride = yuv->stride();
+               other_yuv_params.alpha_data = other->data();
+               other_yuv_params.alpha_stride = other->stride();
+               if (other->pixel_format() == AV_PIX_FMT_RGBA64BE) {
+                       alpha_blend_onto_yuv420p(target_params, other_yuv_params, get_alpha_64be);
+               } else {
+                       alpha_blend_onto_yuv420p(target_params, other_yuv_params, get_alpha_byte);
                }
                break;
        }
        case AV_PIX_FMT_YUV420P10:
        {
-               shared_ptr<Image> yuv = other->convert_pixel_format (dcp::YUV_TO_RGB_REC709, _pixel_format, false, false);
-               dcp::Size const ts = size();
-               dcp::Size const os = yuv->size();
-               for (int ty = start_ty, oy = start_oy; ty < ts.height && oy < os.height; ++ty, ++oy) {
-                       int const hty = ty / 2;
-                       int const hoy = oy / 2;
-                       uint16_t* tY = ((uint16_t *) (data()[0] + (ty * stride()[0]))) + start_tx;
-                       uint16_t* tU = ((uint16_t *) (data()[1] + (hty * stride()[1]))) + start_tx / 2;
-                       uint16_t* tV = ((uint16_t *) (data()[2] + (hty * stride()[2]))) + start_tx / 2;
-                       uint16_t* oY = ((uint16_t *) (yuv->data()[0] + (oy * yuv->stride()[0]))) + start_ox;
-                       uint16_t* oU = ((uint16_t *) (yuv->data()[1] + (hoy * yuv->stride()[1]))) + start_ox / 2;
-                       uint16_t* oV = ((uint16_t *) (yuv->data()[2] + (hoy * yuv->stride()[2]))) + start_ox / 2;
-                       uint8_t* alpha = other->data()[0] + (oy * other->stride()[0]) + start_ox * 4;
-                       for (int tx = start_tx, ox = start_ox; tx < ts.width && ox < os.width; ++tx, ++ox) {
-                               float const a = float(alpha[3]) / 255;
-                               *tY = *oY * a + *tY * (1 - a);
-                               *tU = *oU * a + *tU * (1 - a);
-                               *tV = *oV * a + *tV * (1 - a);
-                               ++tY;
-                               ++oY;
-                               if (tx % 2) {
-                                       ++tU;
-                                       ++tV;
-                               }
-                               if (ox % 2) {
-                                       ++oU;
-                                       ++oV;
-                               }
-                               alpha += 4;
-                       }
+               auto yuv = other->convert_pixel_format (dcp::YUVToRGB::REC709, _pixel_format, Alignment::COMPACT, false);
+               other_yuv_params.data = yuv->data();
+               other_yuv_params.stride = yuv->stride();
+               other_yuv_params.alpha_data = other->data();
+               other_yuv_params.alpha_stride = other->stride();
+               if (other->pixel_format() == AV_PIX_FMT_RGBA64BE) {
+                       alpha_blend_onto_yuv420p10(target_params, other_yuv_params, get_alpha_64be);
+               } else {
+                       alpha_blend_onto_yuv420p10(target_params, other_yuv_params, get_alpha_byte);
                }
                break;
        }
+       case AV_PIX_FMT_YUV422P9LE:
        case AV_PIX_FMT_YUV422P10LE:
        {
-               shared_ptr<Image> yuv = other->convert_pixel_format (dcp::YUV_TO_RGB_REC709, _pixel_format, false, false);
-               dcp::Size const ts = size();
-               dcp::Size const os = yuv->size();
-               for (int ty = start_ty, oy = start_oy; ty < ts.height && oy < os.height; ++ty, ++oy) {
-                       uint16_t* tY = ((uint16_t *) (data()[0] + (ty * stride()[0]))) + start_tx;
-                       uint16_t* tU = ((uint16_t *) (data()[1] + (ty * stride()[1]))) + start_tx / 2;
-                       uint16_t* tV = ((uint16_t *) (data()[2] + (ty * stride()[2]))) + start_tx / 2;
-                       uint16_t* oY = ((uint16_t *) (yuv->data()[0] + (oy * yuv->stride()[0]))) + start_ox;
-                       uint16_t* oU = ((uint16_t *) (yuv->data()[1] + (oy * yuv->stride()[1]))) + start_ox / 2;
-                       uint16_t* oV = ((uint16_t *) (yuv->data()[2] + (oy * yuv->stride()[2]))) + start_ox / 2;
-                       uint8_t* alpha = other->data()[0] + (oy * other->stride()[0]) + start_ox * 4;
-                       for (int tx = start_tx, ox = start_ox; tx < ts.width && ox < os.width; ++tx, ++ox) {
-                               float const a = float(alpha[3]) / 255;
-                               *tY = *oY * a + *tY * (1 - a);
-                               *tU = *oU * a + *tU * (1 - a);
-                               *tV = *oV * a + *tV * (1 - a);
-                               ++tY;
-                               ++oY;
-                               if (tx % 2) {
-                                       ++tU;
-                                       ++tV;
-                               }
-                               if (ox % 2) {
-                                       ++oU;
-                                       ++oV;
-                               }
-                               alpha += 4;
-                       }
+               auto yuv = other->convert_pixel_format (dcp::YUVToRGB::REC709, _pixel_format, Alignment::COMPACT, false);
+               other_yuv_params.data = yuv->data();
+               other_yuv_params.stride = yuv->stride();
+               other_yuv_params.alpha_data = other->data();
+               other_yuv_params.alpha_stride = other->stride();
+               if (other->pixel_format() == AV_PIX_FMT_RGBA64BE) {
+                       alpha_blend_onto_yuv422p9or10le(target_params, other_yuv_params, get_alpha_64be);
+               } else {
+                       alpha_blend_onto_yuv422p9or10le(target_params, other_yuv_params, get_alpha_byte);
+               }
+               break;
+       }
+       case AV_PIX_FMT_YUV444P9LE:
+       case AV_PIX_FMT_YUV444P10LE:
+       {
+               auto yuv = other->convert_pixel_format (dcp::YUVToRGB::REC709, _pixel_format, Alignment::COMPACT, false);
+               other_yuv_params.data = yuv->data();
+               other_yuv_params.stride = yuv->stride();
+               other_yuv_params.alpha_data = other->data();
+               other_yuv_params.alpha_stride = other->stride();
+               if (other->pixel_format() == AV_PIX_FMT_RGBA64BE) {
+                       alpha_blend_onto_yuv444p9or10le(target_params, other_yuv_params, get_alpha_64be);
+               } else {
+                       alpha_blend_onto_yuv444p9or10le(target_params, other_yuv_params, get_alpha_byte);
                }
                break;
        }
@@ -736,6 +1125,7 @@ Image::alpha_blend (shared_ptr<const Image> other, Position<int> position)
        }
 }
 
+
 void
 Image::copy (shared_ptr<const Image> other, Position<int> position)
 {
@@ -751,6 +1141,7 @@ Image::copy (shared_ptr<const Image> other, Position<int> position)
        }
 }
 
+
 void
 Image::read_from_socket (shared_ptr<Socket> socket)
 {
@@ -764,6 +1155,7 @@ Image::read_from_socket (shared_ptr<Socket> socket)
        }
 }
 
+
 void
 Image::write_to_socket (shared_ptr<Socket> socket) const
 {
@@ -777,10 +1169,11 @@ Image::write_to_socket (shared_ptr<Socket> socket) const
        }
 }
 
+
 float
 Image::bytes_per_pixel (int c) const
 {
-       AVPixFmtDescriptor const * d = av_pix_fmt_desc_get(_pixel_format);
+       auto d = av_pix_fmt_desc_get(_pixel_format);
        if (!d) {
                throw PixelFormatError ("bytes_per_pixel()", _pixel_format);
        }
@@ -823,21 +1216,23 @@ Image::bytes_per_pixel (int c) const
        return bpp[c];
 }
 
+
 /** Construct an Image of a given size and format, allocating memory
  *  as required.
  *
  *  @param p Pixel format.
  *  @param s Size in pixels.
- *  @param aligned true to make each row of this image aligned to a 32-byte boundary.
+ *  @param alignment PADDED to make each row of this image aligned to an ALIGNMENT-byte boundary, otherwise COMPACT.
  */
-Image::Image (AVPixelFormat p, dcp::Size s, bool aligned)
+Image::Image (AVPixelFormat p, dcp::Size s, Alignment alignment)
        : _size (s)
        , _pixel_format (p)
-       , _aligned (aligned)
+       , _alignment (alignment)
 {
        allocate ();
 }
 
+
 void
 Image::allocate ()
 {
@@ -850,9 +1245,14 @@ Image::allocate ()
        _stride = (int *) wrapped_av_malloc (4 * sizeof (int));
        _stride[0] = _stride[1] = _stride[2] = _stride[3] = 0;
 
+       auto stride_round_up = [](int stride, int t) {
+               int const a = stride + (t - 1);
+               return a - (a % t);
+       };
+
        for (int i = 0; i < planes(); ++i) {
                _line_size[i] = ceil (_size.width * bytes_per_pixel(i));
-               _stride[i] = stride_round_up (i, _line_size, _aligned ? 32 : 1);
+               _stride[i] = stride_round_up (_line_size[i], _alignment == Alignment::PADDED ? ALIGNMENT : 1);
 
                /* The assembler function ff_rgb24ToY_avx (in libswscale/x86/input.asm)
                   uses a 16-byte fetch to read three bytes (R/G/B) of image data.
@@ -865,7 +1265,7 @@ Image::allocate ()
 
                   Further to the above, valgrind is now telling me that ff_rgb24ToY_ssse3
                   over-reads by more than _avx.  I can't follow the code to work out how much,
-                  so I'll just over-allocate by 32 bytes and have done with it.  Empirical
+                  so I'll just over-allocate by ALIGNMENT bytes and have done with it.  Empirical
                   testing suggests that it works.
 
                   In addition to these concerns, we may read/write as much as a whole extra line
@@ -891,21 +1291,22 @@ Image::allocate ()
                   |XXXwrittenXXX|<------line-size------------->|XXXwrittenXXXXXXwrittenXXX
                                                                               ^^^^ out of bounds
                */
-               _data[i] = (uint8_t *) wrapped_av_malloc (_stride[i] * (sample_size(i).height + 1) + 32);
+               _data[i] = (uint8_t *) wrapped_av_malloc (_stride[i] * (sample_size(i).height + 1) + ALIGNMENT);
 #if HAVE_VALGRIND_MEMCHECK_H
                /* The data between the end of the line size and the stride is undefined but processed by
                   libswscale, causing lots of valgrind errors.  Mark it all defined to quell these errors.
                */
-               VALGRIND_MAKE_MEM_DEFINED (_data[i], _stride[i] * (sample_size(i).height + 1) + 32);
+               VALGRIND_MAKE_MEM_DEFINED (_data[i], _stride[i] * (sample_size(i).height + 1) + ALIGNMENT);
 #endif
        }
 }
 
+
 Image::Image (Image const & other)
-       : boost::enable_shared_from_this<Image>(other)
+       : std::enable_shared_from_this<Image>(other)
        , _size (other._size)
        , _pixel_format (other._pixel_format)
-       , _aligned (other._aligned)
+       , _alignment (other._alignment)
 {
        allocate ();
 
@@ -921,11 +1322,14 @@ Image::Image (Image const & other)
        }
 }
 
-Image::Image (AVFrame* frame)
+
+Image::Image (AVFrame const * frame, Alignment alignment)
        : _size (frame->width, frame->height)
-       , _pixel_format (static_cast<AVPixelFormat> (frame->format))
-       , _aligned (true)
+       , _pixel_format (static_cast<AVPixelFormat>(frame->format))
+       , _alignment (alignment)
 {
+       DCPOMATIC_ASSERT (_pixel_format != AV_PIX_FMT_NONE);
+
        allocate ();
 
        for (int i = 0; i < planes(); ++i) {
@@ -941,10 +1345,11 @@ Image::Image (AVFrame* frame)
        }
 }
 
-Image::Image (shared_ptr<const Image> other, bool aligned)
+
+Image::Image (shared_ptr<const Image> other, Alignment alignment)
        : _size (other->_size)
        , _pixel_format (other->_pixel_format)
-       , _aligned (aligned)
+       , _alignment (alignment)
 {
        allocate ();
 
@@ -961,6 +1366,7 @@ Image::Image (shared_ptr<const Image> other, bool aligned)
        }
 }
 
+
 Image&
 Image::operator= (Image const & other)
 {
@@ -973,6 +1379,7 @@ Image::operator= (Image const & other)
        return *this;
 }
 
+
 void
 Image::swap (Image & other)
 {
@@ -985,9 +1392,10 @@ Image::swap (Image & other)
                std::swap (_stride[i], other._stride[i]);
        }
 
-       std::swap (_aligned, other._aligned);
+       std::swap (_alignment, other._alignment);
 }
 
+
 Image::~Image ()
 {
        for (int i = 0; i < planes(); ++i) {
@@ -999,65 +1407,73 @@ Image::~Image ()
        av_free (_stride);
 }
 
+
 /** @return Array of per-plane data pointers (_data); allocate() fills in one entry for each plane. */
 uint8_t * const *
 Image::data () const
 {
        return _data;
 }
 
+
 /** @return Array of per-plane line sizes (_line_size); allocate() sets each entry to
  *  ceil (width * bytes_per_pixel(plane)).
  */
 int const *
 Image::line_size () const
 {
        return _line_size;
 }
 
+
 /** @return Array of per-plane strides (_stride); allocate() sets each entry to the plane's line size,
  *  rounded up to a multiple of ALIGNMENT when the image's alignment is PADDED.
  */
 int const *
 Image::stride () const
 {
        return _stride;
 }
 
+
 /** @return Size of this image in pixels, as held in _size. */
 dcp::Size
 Image::size () const
 {
        return _size;
 }
 
-bool
-Image::aligned () const
+
+Image::Alignment
+Image::alignment () const
 {
        /* Report the alignment mode held in _alignment, which is set when the image is constructed */
-       return _aligned;
+       return _alignment;
 }
 
+
 /** Combine a list of positioned images into a single positioned image.
  *
  *  An empty list gives a default-constructed PositionImage.  A single image is returned at its
  *  own position after being passed through Image::ensure_alignment().  Otherwise a new image is
  *  made with the pixel format of the first image and the given alignment, made transparent, and
  *  every input is alpha-blended onto it.
  *
  *  @param images Images to merge, with their positions.
  *  @param alignment Alignment of the returned image.
  *  @return The merged image, positioned at all.position().
  */
 PositionImage
-merge (list<PositionImage> images)
+merge (list<PositionImage> images, Image::Alignment alignment)
 {
        if (images.empty ()) {
-               return PositionImage ();
+               return {};
        }
 
        if (images.size() == 1) {
-               return images.front ();
+               images.front().image = Image::ensure_alignment(images.front().image, alignment);
+               return images.front();
        }
 
        /* Start from the rectangle of the first image and extend it with the rectangle of every image;
           presumably this leaves `all` as the bounding box of the whole list (Rect::extend is not shown here).
        */
        dcpomatic::Rect<int> all (images.front().position, images.front().image->size().width, images.front().image->size().height);
-       for (list<PositionImage>::const_iterator i = images.begin(); i != images.end(); ++i) {
-               all.extend (dcpomatic::Rect<int> (i->position, i->image->size().width, i->image->size().height));
+       for (auto const& i: images) {
+               all.extend (dcpomatic::Rect<int>(i.position, i.image->size().width, i.image->size().height));
        }
 
-       shared_ptr<Image> merged (new Image (images.front().image->pixel_format (), dcp::Size (all.width, all.height), true));
+       auto merged = make_shared<Image>(images.front().image->pixel_format(), dcp::Size(all.width, all.height), alignment);
        merged->make_transparent ();
        /* Blend each image in at its position relative to all.position() */
-       for (list<PositionImage>::const_iterator i = images.begin(); i != images.end(); ++i) {
-               merged->alpha_blend (i->image, i->position - all.position());
+       for (auto const& i: images) {
+               merged->alpha_blend (i.image, i.position - all.position());
        }
 
        return PositionImage (merged, all.position ());
 }
 
+
 bool
 operator== (Image const & a, Image const & b)
 {
-       if (a.planes() != b.planes() || a.pixel_format() != b.pixel_format() || a.aligned() != b.aligned()) {
+       if (a.planes() != b.planes() || a.pixel_format() != b.pixel_format() || a.alignment() != b.alignment()) {
                return false;
        }
 
@@ -1082,6 +1498,7 @@ operator== (Image const & a, Image const & b)
        return true;
 }
 
+
 /** Fade the image.
  *  @param f Amount to fade by; 0 is black, 1 is no fade.
  */
@@ -1202,16 +1619,18 @@ Image::fade (float f)
        }
 }
 
+
 shared_ptr<const Image>
-Image::ensure_aligned (shared_ptr<const Image> image)
+Image::ensure_alignment (shared_ptr<const Image> image, Image::Alignment alignment)
 {
-       if (image->aligned()) {
+       if (image->alignment() == alignment) {
                return image;
        }
 
-       return shared_ptr<Image> (new Image (image, true));
+       return make_shared<Image>(image, alignment);
 }
 
+
 size_t
 Image::memory_used () const
 {
@@ -1222,103 +1641,6 @@ Image::memory_used () const
        return m;
 }
 
-class Memory
-{
-public:
-       Memory ()
-               : data(0)
-               , size(0)
-       {}
-
-       ~Memory ()
-       {
-               free (data);
-       }
-
-       uint8_t* data;
-       size_t size;
-};
-
-static void
-png_write_data (png_structp png_ptr, png_bytep data, png_size_t length)
-{
-       Memory* mem = reinterpret_cast<Memory*>(png_get_io_ptr(png_ptr));
-       size_t size = mem->size + length;
-
-       if (mem->data) {
-               mem->data = reinterpret_cast<uint8_t*>(realloc(mem->data, size));
-       } else {
-               mem->data = reinterpret_cast<uint8_t*>(malloc(size));
-       }
-
-       if (!mem->data) {
-               throw EncodeError (N_("could not allocate memory for PNG"));
-       }
-
-       memcpy (mem->data + mem->size, data, length);
-       mem->size += length;
-}
-
-static void
-png_flush (png_structp)
-{
-
-}
-
-static void
-png_error_fn (png_structp png_ptr, char const * message)
-{
-       reinterpret_cast<Image*>(png_get_error_ptr(png_ptr))->png_error (message);
-}
-
-void
-Image::png_error (char const * message)
-{
-       throw EncodeError (String::compose ("Error during PNG write: %1", message));
-}
-
-dcp::ArrayData
-Image::as_png () const
-{
-       DCPOMATIC_ASSERT (bytes_per_pixel(0) == 4);
-       DCPOMATIC_ASSERT (planes() == 1);
-       if (pixel_format() != AV_PIX_FMT_RGBA) {
-               return convert_pixel_format(dcp::YUV_TO_RGB_REC709, AV_PIX_FMT_RGBA, true, false)->as_png();
-       }
-
-       /* error handling? */
-       png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, reinterpret_cast<void*>(const_cast<Image*>(this)), png_error_fn, 0);
-       if (!png_ptr) {
-               throw EncodeError (N_("could not create PNG write struct"));
-       }
-
-       Memory state;
-
-       png_set_write_fn (png_ptr, &state, png_write_data, png_flush);
-
-       png_infop info_ptr = png_create_info_struct(png_ptr);
-       if (!info_ptr) {
-               png_destroy_write_struct (&png_ptr, &info_ptr);
-               throw EncodeError (N_("could not create PNG info struct"));
-       }
-
-       png_set_IHDR (png_ptr, info_ptr, size().width, size().height, 8, PNG_COLOR_TYPE_RGBA, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
-
-       png_byte ** row_pointers = reinterpret_cast<png_byte **>(png_malloc(png_ptr, size().height * sizeof(png_byte *)));
-       for (int i = 0; i < size().height; ++i) {
-               row_pointers[i] = (png_byte *) (data()[0] + i * stride()[0]);
-       }
-
-       png_write_info (png_ptr, info_ptr);
-       png_write_image (png_ptr, row_pointers);
-       png_write_end (png_ptr, info_ptr);
-
-       png_destroy_write_struct (&png_ptr, &info_ptr);
-       png_free (png_ptr, row_pointers);
-
-       return dcp::ArrayData (state.data, state.size);
-}
-
 
 void
 Image::video_range_to_full_range ()
@@ -1332,13 +1654,29 @@ Image::video_range_to_full_range ()
                for (int y = 0; y < lines; ++y) {
                        uint8_t* q = p;
                        for (int x = 0; x < line_size()[0]; ++x) {
-                               *q = int((*q - 16) * factor);
+                               *q = clamp(lrintf((*q - 16) * factor), 0L, 255L);
                                ++q;
                        }
                        p += stride()[0];
                }
                break;
        }
+       case AV_PIX_FMT_RGB48LE:
+       {
+               float const factor = 65536.0 / 56064.0;
+               uint16_t* p = reinterpret_cast<uint16_t*>(data()[0]);
+               int const lines = sample_size(0).height;
+               for (int y = 0; y < lines; ++y) {
+                       uint16_t* q = p;
+                       int const line_size_pixels = line_size()[0] / 2;
+                       for (int x = 0; x < line_size_pixels; ++x) {
+                               *q = clamp(lrintf((*q - 4096) * factor), 0L, 65535L);
+                               ++q;
+                       }
+                       p += stride()[0] / 2;
+               }
+               break;
+       }
        case AV_PIX_FMT_GBRP12LE:
        {
                float const factor = 4096.0 / 3504.0;
@@ -1349,7 +1687,7 @@ Image::video_range_to_full_range ()
                                uint16_t* q = p;
                                int const line_size_pixels = line_size()[c] / 2;
                                for (int x = 0; x < line_size_pixels; ++x) {
-                                       *q = int((*q - 256) * factor);
+                                       *q = clamp(lrintf((*q - 256) * factor), 0L, 4095L);
                                        ++q;
                                }
                        }