Index: libmpcodecs/vf_overlay.c =================================================================== --- libmpcodecs/vf_overlay.c (revision 0) +++ libmpcodecs/vf_overlay.c (revision 0) @@ -0,0 +1,1457 @@ +/* Copyright 2007 Jason Tackaberry + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +/** + * \file vf_overlay.c + * + * \brief Shared memory image overlay with alpha compositing. + * + * See DOCS/tech/vf_overlay.txt for full documentation. + */ + +#include "config.h" + +#ifdef HAVE_SHM + +#include +#include +#include +#ifdef HAVE_MALLOC_H +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_SYS_MMAN_H +#include +#endif +#include + +#include "mp_msg.h" +#include "libvo/fastmemcpy.h" +#include "libvo/video_out.h" +#include "libswscale/swscale.h" +#include "input/input.h" +#include "osdep/timer.h" +#include "cpudetect.h" +#include "mangle.h" + +#include "mp_image.h" +#include "vf.h" +#include "img_format.h" +#include "libavutil/avutil.h" +#include "vf_scale.h" + + +/// If defined will output some timing data. Useful for profiling. +//#define STOPWATCH 8 +/// Turn off MMX for debugging. +//#undef HAVE_MMX + + +/// \name Convenience macros. +//@{ +#define C64(x) ((uint64_t)((x)|(x)<<16))<<32 | (uint64_t)(x) | (uint64_t)(x)<<16 +#define clamp(a,min,max) (((a)>(max))?(max):(((a)<(min))?(min):(a))) +//@} + + +/** \name Overlay image buffer lock flags + * \brief Lock flags for controlling the state of the overlay shmem buffer. + * + * The first byte of the overlay shared memory buffer is set to one of the + * following: + */ +//@{ +/** Overlay buffer is available for writing. vf_overlay sets this flag when it + * is finished reading from the shared buffer and when it first initializes. + */ +#define BUFFER_UNLOCKED 0x10 +/** Overlay buffer is locked. Application sets this flag and mustn't write to + * the buffer again until vf_overlay clears it by setting BUFFER_UNLOCKED. + */ +#define BUFFER_LOCKED 0x20 +//@} + + + +/** \name Rectangle invalidation type flags. + * \brief Indicates the type of invalidation that has occurred for a + * given region. + * + * When the Overlay buffer is updated by a client, the invalidation type will + * be RECT_CONVERT | RECT_PREMULTIPLY. When only the global alpha is changed, + * however, there is no need to do a colorspace conversion, but alpha pre- + * multiplication is needed, so in that event, only RECT_PREMULTIPLY is + * used. + */ +//@{ +/// Region of the overlay buffer requires conversion to YV12A. +#define RECT_CONVERT 0x01 +/// Region requires alpha premultiplication. +#define RECT_PREMULTIPLY 0x02 +//@} + + + +/** Singly linked list of rectangles, specified by left/top coordinate and + * width/height. + */ +struct rects { + int x, y, w, h, type; + struct rects *next; +}; + + +/// Per-instance private data. +struct vf_priv_s { + // Memory allocated by alloc_overlay_data() for the YV12 converted state of + // the overlay image. + uint8_t *y, ///< Luma plane + *u, ///< Chroma (Cb) plane + *v, ///< Chroma (Cr) plane + *a, ///< Alpha plane for luma channel + *uva, ///< Alpha plane for chroma channel + *pre_y, ///< Luma plane with pre-alpha-multiplied pixels + *pre_u, ///< Chroma (Cb) plane with pre-alpha-multiplied pixels + *pre_v, ///< Chroma (Cr) plane with pre-alpha-multiplied pixels + *pre_a, /**< Alpha plane for luma channel where pixels are + averaged with global alpha */ + *pre_uva; /**< Alpha plane for chroma channel where pixels are + averaged with global alpha */ + + /** Lockbyte points to the first byte of the shared memory buffer which + * is used for synchronization. See \a BUFFER_LOCKED and \a + * BUFFER_UNLOCKED flags above. + */ + volatile uint8_t *lockbyte; + /// Points to the BGRA image in shared memory (which is simply (lockbyte+16). + uint8_t *bgra_imgbuf; + /// BGR24 version of \a bgra_imgbuf + uint8_t *bgr24_imgbuf; + /// The alpha plane of \a bgra_imgbuf + uint8_t *alpha_imgbuf; + + /// The shared memory id as gotten from shm_get(). + int shm_id; + /// The shared memory key as given to vf_config (via command line). + key_t shm_key; + + int w, ///< Width of the overlay image (pre-scaled, display size) + h, ///< Height of the overlay image + mpi_w, ///< Width of the scaled overlay image (scaled to fit mpi) + mpi_h, ///< Height of the scaled overlay image + mpi_stride, ///< Stride of the scaled overlay image + slice_y, ///< Y-offset of overlay where compositing will begin + slice_h, ///< Height of the overlay buffer to composite + auto_slice, ///< 1 if slice region should be calculated automatically + alpha, ///< "Global" alpha level of overlay (0 <= alpha <= 256) + visible, ///< Whether or not the overlay is visible + dirty, ///< Whether or not the overlay has changed. + is_paused; ///< 1 if the video is paused, 0 otherwise. + + /** List of regions in the overlay image which have been updated and therefore + * need to be converted from BGRA to YV12. + */ + struct rects *invalid_rects; + /** Keep track of last update timestamp; we update the overlay up to about + * 30 times a second. + */ + unsigned int last_update_time; + struct SwsContext *sws_bgr24, ///< Scaler for BGR24 to YV12 + *sws_y800_l, ///< Scaler for luma alpha plane + *sws_y800_c; ///< Scaler for chroma alpha plane + /// The last mpi that was given to put_image(). + mp_image_t *last_mpi; +}; + + + +/** + * \brief Array of all vf_overlay instances private data. + * + * Keep track of filter instances private data because the overlay buffer + * should be able to survive a loadfile or loop, so when the filter is + * initialized, we first check to see if we have an existing filter + * associated with the specified shared memory key and use that instead. + * + * As a result, vf_overlay instances are "persistent" (i.e., they don't get + * uninitialized). Consequently, the global variables below apply to all + * vf_overlay instances. + */ +static struct vf_priv_s **vf_overlay_priv = NULL; +/// Number of vf_overlay instances +static int num_instances = 0; + +#ifdef HAVE_MMX +static uint64_t attribute_used __attribute__((aligned(8))) MM_global_alpha; +static uint64_t attribute_used __attribute__((aligned(8))) MM_ROUND = C64(0x80); +#endif + +#ifdef STOPWATCH +/** + * \brief Simple timer for profiling and debugging. + * + * \param n Identifier of this stopwatch, where 0 < n < 10. + * \param text NULL to start the stopwatch, and a printf-style formatted + * string to stop the stopwatch. + */ +static void +stopwatch(int n, char *text, ...) +{ + va_list ap; + static struct { + unsigned int time, last_time; + char text[250]; + } t[10]; + + if (n > STOPWATCH) + return; + + t[n].time = GetTimer(); + if (!text) { + fprintf(stderr, "@@@ Stopwatch (%d): %s: %d usec\n", n, t[n].text, + t[n].time - t[n].last_time); + } else { + t[n].last_time = t[n].time; + + va_start(ap, text); + vsprintf(t[n].text, text, ap); + va_end(ap); + } +} +#else +#define stopwatch(n, text, ...) +#endif + + + +/** + * \brief Allocate buffer to hold YV12 version of the overlay image. + * + * \param priv Private data for this filter instance. + * + * \return 1 if allocation was successful, or 0 otherwise. + * + * priv->buffer is allocated based on the requested overlay width and height + * with all bytes set to 0. + */ +static int +alloc_overlay_data(struct vf_priv_s *priv) +{ + int w = priv->mpi_stride, h = priv->mpi_h; + + // Boilerplate + #define alloc(buf, size) \ + buf = (uint8_t *)memalign(16, size); \ + if (!buf) return 0 + + alloc(priv->y, w * h); + alloc(priv->u, w * h / 4); + alloc(priv->v, w * h / 4); + alloc(priv->a, w * h); + alloc(priv->uva, w * h / 4); + + // Buffers for alpha-multiplied pixels + alloc(priv->pre_y, w * h); + alloc(priv->pre_u, w * h / 4); + alloc(priv->pre_v, w * h / 4); + alloc(priv->pre_a, w * h); + alloc(priv->pre_uva, w * h / 4); + + alloc(priv->alpha_imgbuf, priv->w * priv->h); + + /* Holds BGR24 version of the image buffer. We hold one extra byte + * because in convert_bgra_to_yv12a() we copy 4 bytes at a time, but + * offset only 3 bytes. This is faster than 3 (or 2) copies, but means + * we need an extra byte so we're staying within the allocated buffer. + */ + alloc(priv->bgr24_imgbuf, priv->w * priv->h * 3 + 1); + return 1; +} + + + +/** + * \brief Detach and delete shared memory segment and free overlay buffers. + * + * \param priv Private data for this filter instance. + */ +static void +free_overlay_data(struct vf_priv_s *priv) +{ + struct shmid_ds shmemds; + + // Boilerplate + #define dealloc(buf) \ + if (buf) { \ + free(buf); \ + buf = 0; \ + } + + dealloc(priv->y); + dealloc(priv->u); + dealloc(priv->v); + dealloc(priv->a); + dealloc(priv->uva); + dealloc(priv->pre_y); + dealloc(priv->pre_u); + dealloc(priv->pre_v); + dealloc(priv->pre_a); + dealloc(priv->pre_uva); + + dealloc(priv->alpha_imgbuf); + dealloc(priv->bgr24_imgbuf); + + if (priv->sws_bgr24) + sws_freeContext(priv->sws_bgr24); + if (priv->sws_y800_l) + sws_freeContext(priv->sws_y800_l); + if (priv->sws_y800_c) + sws_freeContext(priv->sws_y800_c); + priv->sws_bgr24 = priv->sws_y800_l = priv->sws_y800_c = 0; + + if (priv->shm_id > 0 && priv->lockbyte) { + shmctl(priv->shm_id, IPC_RMID, &shmemds); + shmdt((uint8_t *)priv->lockbyte); + priv->lockbyte = 0; + } +} + + + +/** \brief Free all buffers for all overlay filter instances. + * + * Because vf_overlay instances must survive a loadfile or loop, vf_uninit is + * not specified. Therefore, when the first vf_overlay instance is created, + * this function is registered with atexit(3), so that the shared memory + * segment allocated in vf_config is properly deleted. The overlay buffers are + * also freed in the call to free_overlay_data. (Although this is not strictly + * necessary since we are shutting down at this point, it is called for + * correctness.) + */ +static void +free_all_overlay() +{ + int i; + + if (vf_overlay_priv) + for (i = 0; i < num_instances; i++) + free_overlay_data(vf_overlay_priv[i]); +} + + + +/** + * \brief Determines if it's ok for the overlay to update based on a rough + * update rate of about 30 times a second. + * + * \param priv Private data for this filter instance. + * \param set If 1, it means the caller intends to update the overlay if this + * function returns 1, in which case last_update_time is set + * to the current time. + */ +static int +should_update(struct vf_priv_s *priv, int set) +{ + unsigned int time = GetTimerMS(); + + // This results in updates about 30 times a second, give or take. + if (time - priv->last_update_time > 25 || time < priv->last_update_time) { + if (set) + priv->last_update_time = time; + return 1; + } + return 0; +} + + + +/** + * \brief Checks to see if the next filter accepts YV12 images. + */ +static int +query_format(struct vf_instance_s* vf, unsigned int fmt) +{ + if (fmt == IMGFMT_YV12) + return vf_next_query_format(vf, fmt); + return 0; +} + + + +/** + * \brief Configure the filter and call the next filter's config function. + */ +static int +config(struct vf_instance_s* vf, int width, int height, int d_width, int d_height, + unsigned int flags, unsigned int fmt) +{ + struct vf_priv_s *priv = vf->priv; + char *accel_str; + uint8_t *imgbuf; + int bufsize; + + priv->is_paused = 0; + + if (priv->bgra_imgbuf) { + // Already initialized; doing a loadfile or a loop. + if (priv->w == d_width && priv->h == d_height && priv->mpi_w == width && priv->mpi_h == height) { + mp_msg(MSGT_VFILTER, MSGL_INFO, "overlay: reusing existing buffer (%dx%d BGRA)\n", priv->w, priv->h); + return vf_next_config(vf, width, height, d_width, d_height, flags, fmt); + } + // Overlay size is different, so we need to resize. First free existing + // buffers. + free_overlay_data(priv); + } + + priv->w = (d_width + 1) & ~1; + priv->h = (d_height + 1) & ~1; + // Automatically calculate slice by default. + priv->auto_slice = 1; + priv->mpi_w = (width + 1) & ~1; + priv->mpi_h = (height + 1) & ~1; + priv->mpi_stride = priv->mpi_w; + + if (!alloc_overlay_data(priv)) + return 0; + + priv->sws_bgr24 = sws_getContext(priv->w, priv->h, PIX_FMT_BGR24, priv->mpi_w, priv->mpi_h, + PIX_FMT_YUV420P, get_sws_cpuflags() | SWS_BICUBIC, NULL, NULL, NULL); + priv->sws_y800_l = sws_getContext(priv->w, priv->h, PIX_FMT_GRAY8, priv->mpi_w, priv->mpi_h, + PIX_FMT_GRAY8, get_sws_cpuflags() | SWS_BICUBIC, NULL, NULL, NULL); + priv->sws_y800_c = sws_getContext(priv->w, priv->h, PIX_FMT_GRAY8, priv->mpi_w >> 1, priv->mpi_h >> 1, + PIX_FMT_GRAY8, get_sws_cpuflags() | SWS_BICUBIC, NULL, NULL, NULL); + + // 1 lock byte + 15 padding bytes + 32bpp + bufsize = 16 + priv->w * priv->h * 4; + + priv->shm_id = shmget(priv->shm_key, bufsize, IPC_CREAT | 0600); + if (priv->shm_id < 0) { + mp_msg(MSGT_VFILTER, MSGL_ERR, "overlay: ERROR: unable to open shmem (%d): %s\n", + priv->shm_key, strerror(errno)); + return 0; + } + imgbuf = shmat(priv->shm_id, NULL, 0); + if (!imgbuf) { + mp_msg(MSGT_VFILTER, MSGL_ERR, "overlay: ERROR: couldn't mmap %d bytes from shmem (%d): %s\n", + bufsize, priv->shm_key, strerror(errno)); + return 0; + } + + // Start with overlay hidden. + priv->visible = 0; + priv->alpha = 255; + priv->lockbyte = imgbuf; + priv->bgra_imgbuf = imgbuf + 16; + *priv->lockbyte = BUFFER_UNLOCKED; + + accel_str = "no acceleration"; +#ifdef HAVE_MMX + if (gCpuCaps.hasMMX) + accel_str = "MMX accelerated"; +#endif + + mp_msg(MSGT_VFILTER, MSGL_INFO, "overlay: %dx%d BGRA (frame %dx%d); shmem key: %u; %s.\n", + priv->w, priv->h, width, height, vf->priv->shm_key, accel_str); + + return vf_next_config(vf, priv->mpi_w, priv->mpi_h, priv->w, priv->h, flags, fmt); +} + + + +/** + * \brief Translates coordinates from overlay image to mpi. + * + * \param x Pointer to left coordinate of overlay + * \param y Pointer to top coordinate of overlay + * \param w Pointer to width relative to overlay + * \param h Pointer to height relative to overlay + * + * This function maps the passed coordinates from overlay to mpi. The new + * values are returned through the pointers. Any of the parameters may + * safely be NULL. + */ +static inline void +translate_coords(struct vf_priv_s *priv, int *x, int *y, int *w, int *h) +{ + float xdiff = (float)priv->w / priv->mpi_w, + ydiff = (float)priv->h / priv->mpi_h; + + if (x) *x = (int)((float)*x / xdiff); + if (w) { + *w = (int)((float)*w / xdiff); + if (*w > priv->mpi_w) + *w = priv->mpi_w; + } + if (y) *y = (int)((float)*y / ydiff); + if (h) { + *h = (int)((float)*h / ydiff); + if (*h > priv->mpi_h) + *h = priv->mpi_h; + } + +} + +/** + * \brief Automatically determines slice region. + * + * This function uses the chroma alpha plane to determine the slice region + * for blending the overlay. Rather than blending the entire overlay, only + * the calculated slice is blended. Autoslice is enabled by default, but the + * user may manually specify a slice region, in which case this function will + * not be called. + */ +static void +calculate_slice(struct vf_priv_s *priv) +{ + int x, y, h, row_stride, slice_y1 = -2, slice_y2 = -2; + uint8_t *p; + + p = priv->uva; + row_stride = priv->mpi_stride >> 1; + h = priv->mpi_h >> 1; + + stopwatch(3, "calculate_slice"); + + #define check_opaque(type) \ + if (*(type*)(p + x)) { \ + if (slice_y1 == -2) \ + slice_y1 = y; \ + else \ + slice_y2 = y; \ + x = row_stride; \ + break; \ + } + + for (y = 0; y < h; y++) { + for (x = 0; x < row_stride-7; x += 8) + check_opaque(uint64_t); + for (; x < row_stride-3; x += 4) + check_opaque(uint32_t); + for (; x < row_stride-1; x += 2) + check_opaque(uint16_t); + p += row_stride; + } + stopwatch(3, NULL); + priv->slice_y = clamp((slice_y1 - 2) * 2, 0, priv->mpi_h); + priv->slice_h = clamp((slice_y2 + 2) * 2, 0, priv->mpi_h) - priv->slice_y; +} + + + +/** + * \brief Do colorspace conversion from BGRA to "YV12A". + * + * \param priv Private data for this filter instance. + * \param ry Top of region to convert. + * \param rh Height of the region to convert. + * + * Converts an BGRA image to YV12 plus two alpha planes representing the alpha + * for the luma and chroma planes, scaling the overlay image to fit the frame + * size if necessary. + */ +static void +convert_bgra_to_yv12a(struct vf_priv_s *priv, int ry, int rh) +{ + int i, orig_y = ry, orig_h = rh, dst_y, dst_h, src_strides[3], dst_strides[3]; + uint8_t *p_alpha, *p_bgr24, *p_bgr32, *src[3], *dst[3]; + + /* We need to adjust the y-offset of the slice such that the overlay + * y-offset and the post-scaled mpi y-offset are both even (otherwise we + * see distortion in the chroma planes). If we do need to scale the + * overlay (i.e. overlay height is not mpi height), then increase the + * slice by a few rows on either side to cover any blending by the scaler. + */ + i = priv->h != priv->mpi_h ? 4 : 0; + do { + dst_y = ry = clamp((orig_y - i) & ~1, 0, priv->h); + dst_h = rh = clamp((orig_h + 1 + i*2) & ~1, 0, priv->h - ry); + translate_coords(priv, 0, &dst_y, 0, &dst_h); + i+=2; + } while (dst_y % 2 != 0 && i < ry); + + stopwatch(5, "convert_bgra_to_yv12a (%d - %d) -> (%d - %d)", ry, rh, dst_y, dst_h); + + // Decompose BGR32 into BGR24 plus alpha plane. + stopwatch(6, "decompose"); + p_alpha = priv->alpha_imgbuf + (ry * priv->w); + p_bgr24 = priv->bgr24_imgbuf + (ry * priv->w * 3); + p_bgr32 = priv->bgra_imgbuf + (ry * priv->w * 4); + + for (i = priv->w * rh; i > 0; i--, p_bgr32 += 4, p_alpha++, p_bgr24 += 3) { + // Moving 32 bits is faster than 3 separate assignments (or one 16 + // bit and and one 8 bit move). The BGR24 buffer has one extra byte + // allocated to prevent an overrun. + *(uint32_t *)p_bgr24 = *(uint32_t *)p_bgr32; + *p_alpha = p_bgr32[3]; + } + + stopwatch(6, NULL); + + // Source is BGR24 overlay image offset to top of slice. + src[0] = priv->bgr24_imgbuf + (ry * priv->w * 3); + src[1] = src[2] = 0; + src_strides[0] = priv->w * 3; + src_strides[1] = src_strides[2] = 0; + + // Dest is YV12 buffers offset to top of slice. + dst[0] = priv->y + (dst_y * priv->mpi_stride); + dst[1] = priv->u + ((dst_y * priv->mpi_stride) >> 2); + dst[2] = priv->v + ((dst_y * priv->mpi_stride) >> 2); + dst_strides[0] = priv->mpi_stride; + dst_strides[1] = priv->mpi_stride >> 1; + dst_strides[2] = priv->mpi_stride >> 1; + + // Scale BGR24 -> YV12 overlay image (without alpha) + sws_scale(priv->sws_bgr24, src, src_strides, 0, rh, dst, dst_strides); + + // Source is overlay-sized alpha plane offset to top of slice. + src[0] = priv->alpha_imgbuf + (ry * priv->w); + src[1] = src[2] = 0; + src_strides[0] = priv->w; + src_strides[1] = src_strides[2] = 0; + + // Dest is mpi-sized alpha for luma plane offset to top of slice. + dst[0] = priv->a + (dst_y * priv->mpi_stride); + dst[1] = dst[2] = 0; + dst_strides[0] = priv->mpi_stride; + dst_strides[1] = dst_strides[2] = 0; + + // Scale Y800 -> Y800 (luma alpha) + sws_scale(priv->sws_y800_l, src, src_strides, 0, rh, dst, dst_strides); + + // Dest is mpi-sized alpha for chroma plane offset to top of slice. + dst[0] = priv->uva + ((dst_y * priv->mpi_stride) >> 2); + dst_strides[0] = priv->mpi_stride >> 1; + + // Scale Y800 -> Y800 (chroma alpha) + sws_scale(priv->sws_y800_c, src, src_strides, 0, rh, dst, dst_strides); + + stopwatch(5, NULL); +} + + + +/** + * \brief Adds a rectangle to the list of invalid regions for the overlay. + * Rectangles within the list may overlap. + * + * \param priv Private data for this filter instance. + * \param x,y Top left coordinate of the invalid region + * \param w,h Width and height of the invalid region. + * \param type Type of invalidation. Can be one or both of RECT_CONVERT + * (convert from BGRA to YV12A) and RECT_PREMULTIPLY (do alpha + * premultiplication for that rectangle). + */ +static void +invalidate_rect(struct vf_priv_s *priv, int x, int y, int w, int h, int type) +{ + struct rects *r, *p; + + r = (struct rects *)malloc(sizeof(struct rects)); + // Round coordinates down to multiples of 2. + r->x = x & ~1; r->y = y & ~1; + // Round sizes up to multiples of 2. + r->w = (w + 1) & ~1; r->h = (h + 1) & ~1; + r->type = type; + r->next = NULL; + + //r->y=0;r->h=priv->h; + // Ensure coordinates are within the overlay image boundaries + if (r->x < 0) + r->x = 0; + else if (r->x > priv->w) + r->x = priv->w; + if (r->y < 0) + r->y = 0; + else if (r->y > priv->h) + r->y = priv->h; + if (r->w < 0) + r->w = 0; + else if (r->w > priv->w - r->x) + r->w = priv->w - r->x; + if (r->h < 0) + r->h = 0; + else if (r->h > priv->h - r->y) + r->h = priv->h - r->y; + + if (!priv->invalid_rects) { + priv->invalid_rects = r; + return; + } + + // Update any existing invalidated region to reflect the new type + for (p = priv->invalid_rects; p != NULL; p = p->next) { + if (p->x == x && p->y == y && p->w == w && p->h == h) { + p->type |= type; + return; + } + } + // Insert new rect at the front + r->next = priv->invalid_rects; + priv->invalid_rects = r; +} + + + +/** + * \brief Alpha multiplication (approximates division by 255). + * + * \param r The color value. + * \param a The alpha level (0 <= a <= 255). + * + * \return The alpha-multiplied value. + */ +static inline uint8_t +multiply_alpha(uint8_t r, uint8_t a) +{ + int temp = (r * a) + 0x80; + return ((temp + (temp >> 8)) >> 8); +} + + + +/// Blends src on top of dst at the given alpha level. +#define blend_byte(dst, src, alpha) multiply_alpha(dst, alpha) + src; + + +/** + * \brief Alpha-multiplies a byte and stores the result. + * + * \param byte The byte to be multiplied. + * \param alpha The alpha level of byte. + * \param dst_byte Pointer to where the alpha-mulplied byte will be stored. + * \param dst_alpha Pointer to where the alpha value for that byte will be stored. + * \param global_alpha The global alpha level (for the whole overlay image). + * + * This function calculates the average of the per-pixel alpha and the global + * alpha, stores that resulting average in dst_alpha, alpha-multiplies the + * byte with that averaged alpha, and stores the alpha-multiplied byte into + * dst_byte. + */ +static inline void +premultiply_alpha_byte(uint8_t byte, uint8_t alpha, + uint8_t *dst_byte, uint8_t *dst_alpha, + int global_alpha) +{ + uint8_t a = (global_alpha < 255) ? alpha * global_alpha >> 8 : alpha; + *dst_byte = multiply_alpha(byte, a); + *dst_alpha = 255-a; +} + + + +/** + * \brief Alpha-multiplies 8 consecutive bytes. C version. + */ +static void +premultiply_alpha_byte_8_C(uint8_t *byte, uint8_t *alpha, + uint8_t *dst_byte, uint8_t *dst_alpha, + int global_alpha) +{ + int i; + for (i = 0; i < 8; i++) + premultiply_alpha_byte(*(byte++), *(alpha++), dst_byte++, dst_alpha++, global_alpha); +} + + + +#ifdef HAVE_MMX +/** + * \brief Alpha-multiplies 8 consecutive bytes. MMX version. + */ +static void +premultiply_alpha_byte_8_MMX(uint8_t *byte, uint8_t *alpha, + uint8_t *dst_byte, uint8_t *dst_alpha, + int global_alpha) +{ + asm volatile( + "pxor %%mm7, %%mm7\n\t" // zero out %mm7 + "pcmpeqb %%mm4, %%mm4\n\t" // %mm4 = 255's + "movq (%3), %%mm5\n\t" // %mm5 = alpha + "cmp $255, %4\n\t" // don't apply layer alpha if it's 100% opaque + "je 42f\n\t" + + // Modify alpha from image with layer alpha + "movq %%mm5, %%mm6\n\t" // %mm6 = %mm5 = alpha + "punpcklbw %%mm7, %%mm5\n\t" // %mm5 = low dword of alpha + "punpckhbw %%mm7, %%mm6\n\t" // %mm6 = hi dword of alpha + "pmullw "MANGLE(MM_global_alpha)", %%mm5\n\t" // alpha * global_alpha + "pmullw "MANGLE(MM_global_alpha)", %%mm6\n\t" + "psrlw $8, %%mm5\n\t" // Divide by 256 + "psrlw $8, %%mm6\n\t" + "packuswb %%mm6, %%mm5\n\t" // Pack back into %mm5 + + "42: \n\t" + "movq %%mm4, %%mm6\n\t" // %mm4 = %mm6 = 255 + "psubb %%mm5, %%mm6\n\t" // %mm6 = 255 - alpha + "movq %%mm6, (%1)\n\t" // save modified alpha + + // Do alpha * bytes + "movq (%2), %%mm0\n\t" // %mm0 = byte + "movq %%mm0, %%mm1\n\t" // %mm1 = byte + "punpcklbw %%mm7, %%mm0\n\t" // %mm0 = low dword of bytes + "punpckhbw %%mm7, %%mm1\n\t" // %mm1 = hi dword of bytes + "movq %%mm5, %%mm6\n\t" // %mm5 = %mm6 = alpha + "punpcklbw %%mm7, %%mm5\n\t" // %mm5 = low dword alpha + "punpckhbw %%mm7, %%mm6\n\t" // %mm6 = hi dword alpha + "pmullw %%mm5, %%mm0\n\t" // alpha * bytes = (r*a) + "pmullw %%mm6, %%mm1\n\t" + // approximate division by 255 + "movq "MANGLE(MM_ROUND)", %%mm6\n\t" // %mm4 = round + "paddw %%mm6, %%mm0\n\t" // (r*a) + 0x80 + "paddw %%mm6, %%mm1\n\t" + "movq %%mm0, %%mm2\n\t" // temp = (r*a) + 0x80 + "movq %%mm1, %%mm3\n\t" + "psrlw $8, %%mm0\n\t" // temp >> 8 + "psrlw $8, %%mm1\n\t" + "paddw %%mm2, %%mm0\n\t" // temp + (temp >> 8) + "paddw %%mm3, %%mm1\n\t" + "psrlw $8, %%mm0\n\t" // (temp+(temp>>8))>>8 + "psrlw $8, %%mm1\n\t" + + "packuswb %%mm1, %%mm0\n\t" + "movq %%mm0, (%0)\n\t" + : "+r" (dst_byte), // %0 + "+r" (dst_alpha) // %1 + : "r" (byte), // %2 + "r" (alpha), // %3 + "r" (global_alpha)); // %4 +} +#endif + + + +/** + * \brief Alpha-multiplies 8 consecutive bytes. + * + * This function pointer is set during vf_open and is set to either + * premultiply_alpha_byte_8_C or premultiply_alpha_byte_8_MMX depending + * on CPU capabilities. + */ +static void +(*premultiply_alpha_byte_8)(uint8_t *byte, uint8_t *alpha, + uint8_t *dst_byte, uint8_t *dst_alpha, + int global_alpha); + + + +/** + * \brief Pre-alpha-multiply all pixels of the YV12A overlay image in the + * specified region. + * + * \param priv Private data for this filter instance. + * \param rx,ry Top left coordinate of region to premultiply. + * \param rw,rh Width and height of region to premultiply. + */ +static void +image_premultiply_alpha(struct vf_priv_s *priv, int rx, int ry, int rw, int rh) +{ + int w = priv->mpi_stride, global_alpha = priv->alpha; + uint8_t *y_ptr, *u_ptr, *v_ptr, *a_ptr, *uva_ptr, + *pre_y_ptr, *pre_u_ptr, *pre_v_ptr, *pre_a_ptr, *pre_uva_ptr; + int luma_offset, chroma_offset; + int x, y, chroma_stride; + + stopwatch(4, "premultiply_alpha (%d,%d %dx%d)", rx, ry, rw, rh); + + translate_coords(priv, &rx, &ry, &rw, &rh); + + if (global_alpha > 255) + global_alpha = 255; + + luma_offset = rx + ry*w; + chroma_offset = (rx>>1) + (ry>>1)*(w>>1); + + y_ptr = priv->y + luma_offset; + u_ptr = priv->u + chroma_offset; + v_ptr = priv->v + chroma_offset; + a_ptr = priv->a + luma_offset; + uva_ptr = priv->uva + chroma_offset; + + pre_y_ptr = priv->pre_y + luma_offset; + pre_u_ptr = priv->pre_u + chroma_offset; + pre_v_ptr = priv->pre_v + chroma_offset; + pre_a_ptr = priv->pre_a + luma_offset; + pre_uva_ptr = priv->pre_uva + chroma_offset; + +#ifdef HAVE_MMX + if (gCpuCaps.hasMMX) + MM_global_alpha = C64(global_alpha); +#endif + + chroma_stride = w >> 1; + for (y = 0; y < rh; y += 2) { + for (x = 0; x < (rw & ~7); x += 8) + premultiply_alpha_byte_8(&y_ptr[x], &a_ptr[x], &pre_y_ptr[x], &pre_a_ptr[x], global_alpha); + for (; x < rw; x++) + premultiply_alpha_byte(y_ptr[x], a_ptr[x], &pre_y_ptr[x], &pre_a_ptr[x], global_alpha); + + for (x = 0; x < ((rw >> 1) & ~7); x += 8) { + premultiply_alpha_byte_8(&u_ptr[x], &uva_ptr[x], &pre_u_ptr[x], &pre_uva_ptr[x], global_alpha); + premultiply_alpha_byte_8(&v_ptr[x], &uva_ptr[x], &pre_v_ptr[x], &pre_uva_ptr[x], global_alpha); + } + for (; x < rw >> 1; x++) { + premultiply_alpha_byte(u_ptr[x], uva_ptr[x], &pre_u_ptr[x], &pre_uva_ptr[x], global_alpha); + premultiply_alpha_byte(v_ptr[x], uva_ptr[x], &pre_v_ptr[x], &pre_uva_ptr[x], global_alpha); + } + y_ptr += w; + u_ptr += chroma_stride; + v_ptr += chroma_stride; + a_ptr += w; + uva_ptr += chroma_stride; + + pre_y_ptr += w; + pre_u_ptr += chroma_stride; + pre_v_ptr += chroma_stride; + pre_a_ptr += w; + pre_uva_ptr += chroma_stride; + + for (x = 0; x < (rw & ~7); x += 8) + premultiply_alpha_byte_8(&y_ptr[x], &a_ptr[x], &pre_y_ptr[x], &pre_a_ptr[x], global_alpha); + for (; x < rw; x++) + premultiply_alpha_byte(y_ptr[x], a_ptr[x], &pre_y_ptr[x], &pre_a_ptr[x], global_alpha); + + y_ptr += w; + a_ptr += w; + pre_y_ptr += w; + pre_a_ptr += w; + } +#ifdef HAVE_MMX + if (gCpuCaps.hasMMX) + asm volatile( "emms\n\t" ::: "memory" ); +#endif + stopwatch(4, NULL); +} + + + +/** + * \brief Blends one plane of the overlay onto the mpi in the given slice. + * C version. + * + * \param w Width to blend (either the width of the overlay or the width of the + * mpi, whichever is smaller). + * \param slice_h Number of rows to blend. + * \param dst Pointer to the buffer that will receive all blended bytes + * for this plane. + * \param src Pointer to the mpi plane buffer. + * \param overlay Pointer to the overlay plane buffer. + * \param alpha Pointer to the alpha for this plane. + * \param mpi_stride Stride for src + * \param dst_stride Stride for overlay + * + * No bounds checking is performed, so the caller is responsible for ensuring + * that all pointers are properly positioned and are sufficiently large. + */ +static void +blend_plane_C(int w, int slice_h, uint8_t *dst, uint8_t *src, + uint8_t *overlay, uint8_t *alpha, int mpi_stride, + int dmpi_stride, int overlay_stride) +{ + int x, y; + for (y = 0; y < slice_h; y++) { + for (x = 0; x < w; x++) + *(dst + x) = blend_byte(*(src+x), *(overlay+x), *(alpha+x)); + dst += dmpi_stride; + src += mpi_stride; + overlay += overlay_stride; + alpha += overlay_stride; + } +} + + + +#ifdef HAVE_MMX +/** + * \brief Blends one plane of the overlay onto the mpi in the given slice. + * MMX version. + * \see blend_plane_C for parameter details + */ +static void +blend_plane_MMX(int w, int slice_h, uint8_t *dst, uint8_t *src, + uint8_t *overlay, uint8_t *alpha, int mpi_stride, + int dmpi_stride, int overlay_stride) +{ + int i, y, q = w / 8, r = w % 8; + + for (y = 0; y < slice_h; y++) { + if (q) { + asm volatile( + "xor %%"REG_c", %%"REG_c"\n\t" + + "1: \n\t" + "movq (%1, %%"REG_c"), %%mm0\n\t" // %mm0 = mpi + "movq %%mm0, %%mm1\n\t" // %mm1 = mpi + "movq (%3, %%"REG_c"), %%mm2\n\t" // %mm2 = %mm3 = 255 - alpha + "movq %%mm2, %%mm3\n\t" + + "punpcklbw %%mm7, %%mm0\n\t" // %mm0 = low dword of mpi + "punpckhbw %%mm7, %%mm1\n\t" // %mm1 = hi dword of mpi + "punpcklbw %%mm7, %%mm2\n\t" // %mm0 = low dword of 255-a + "punpckhbw %%mm7, %%mm3\n\t" // %mm1 = hi dword of 255-a + "pmullw %%mm2, %%mm0\n\t" // (255-a) * mpi = (r*a) + "pmullw %%mm3, %%mm1\n\t" + // approximate division by 255 + "paddw %%mm5, %%mm0\n\t" // (r*a) + 0x80 + "paddw %%mm5, %%mm1\n\t" + "movq %%mm0, %%mm2\n\t" // temp = (r*a) + 0x80 + "movq %%mm1, %%mm3\n\t" + "psrlw $8, %%mm0\n\t" // temp >> 8 + "psrlw $8, %%mm1\n\t" + "paddw %%mm2, %%mm0\n\t" // temp + (temp >> 8) + "paddw %%mm3, %%mm1\n\t" + "psrlw $8, %%mm0\n\t" // (temp+(temp>>8))>>8 + "psrlw $8, %%mm1\n\t" + + // MPI plane now alpha-multiplied. Add to premultiplied + // overlay plane. + "movq (%2, %%"REG_c"), %%mm2\n\t" // %mm2 = src image (overlay) + "packuswb %%mm1, %%mm0\n\t" + "paddb %%mm2, %%mm0\n\t" + "movq %%mm0, (%0, %%"REG_c")\n\t" // Store to dst (mpi) + + "add $8, %%"REG_c"\n\t" + "cmp %4, %%"REG_c"\n\t" + "jb 1b \n\t" + + : "+r" (dst), // %0 + "+r" (src), // %1 + "+r" (overlay), // %2 + "+r" (alpha) // %3 + : "m" (w) // %4 + : "%"REG_c); + } + // Blend the last few pixels of this row ... + if (r) { + for (i = 0; i < r; i++) + *(dst+i) = blend_byte(*(src+i), *(overlay+i), *(alpha+i)); + } + src += mpi_stride; + dst += dmpi_stride; + alpha += overlay_stride; + overlay += overlay_stride; + } +} +#endif + + + +/** + * \brief Blends one plane of the overlay onto the mpi. + * \see blend_plane_C for parameter details. + * + * This function pointer is set during vf_open and is set to either + * blend_plane_C or blend_plane_MMX depending on CPU capabilities. + */ +static void +(*blend_plane)(int w, int slice_h, uint8_t *dst, uint8_t *src, + uint8_t *overlay, uint8_t *alpha, int mpi_stride, + int dmpi_stride, int overlay_stride); + + + +/** + * \brief Blends the overlay onto the mpi. + * + * \param priv Private data for this filter instance. + * \param src_mpi The source mpi (as given the vf_put_image) + * \param dst_mpi The destination mpi. + * + * This function composites the overlay over the video mpi in the slice + * region specified in a slave command. (If no slice region has been + * explicitly set, it defaults to the whole overlay image.) If the global + * alpha is 256, the overlay is simply memcpy'd to the dst_mpi in the slice + * region, thus ignoring the per-pixel alpha values of the overlay (in that + * slice). (Pixels outside the slice are copied from src_mpi). If the global + * alpha is 255 or less, then each pixel of the overlay is composited over + * the src_mpi. + * + * The overlay is clipped to the dimensions of the mpi. + */ +static inline void +blend_image(struct vf_priv_s *priv, mp_image_t *src_mpi, mp_image_t *dst_mpi) +{ + int slice_y, slice_h, w, i, c, plane, overlay_stride[3]; + uint8_t *dst_mpi_planes[3], *src_mpi_planes[3], *overlay, *src, *dst, *alpha, + *overlay_planes[3] = { priv->pre_y, priv->pre_u, priv->pre_v }, + *alpha_planes[3] = { priv->pre_a, priv->pre_uva, priv->pre_uva }; + + // Clip the slice to the mpi image. Slice region is already translated + // to mpi size. + slice_y = priv->slice_y; + slice_h = priv->slice_h; + + if (slice_y < 0) + slice_y = 0; + else if (slice_y > src_mpi->height) + slice_y = src_mpi->height; + + if (slice_h < 0) + slice_h = 0; + else if (slice_h > src_mpi->height - slice_y) + slice_h = src_mpi->height - slice_y; + + stopwatch(4, "blend_image (0,%d, %dx%d)", slice_y, priv->mpi_w, slice_h); + + for (i = 0, c = 0; i < 3; i++, c = 1) { + // Setup buffer positions for overlay, mpi src and mpi dst. + overlay_stride[i] = priv->mpi_stride >> c; + dst_mpi_planes[i] = dst_mpi->planes[i] + ((slice_y >> c) * dst_mpi->stride[i]); + src_mpi_planes[i] = src_mpi->planes[i] + ((slice_y >> c) * src_mpi->stride[i]); + overlay_planes[i] += (slice_y >> c) * overlay_stride[i]; + alpha_planes[i] += (slice_y >> c) * overlay_stride[i]; + + if (src_mpi == dst_mpi) + continue; + + // If we're compositing only a slice, copy the parts of the mpi + // above and below the slice. + if (slice_y > 0) + // MPI above the overlay slice. + memcpy_pic(dst_mpi->planes[i], src_mpi->planes[i], src_mpi->w, slice_y >> c, + dst_mpi->stride[i], src_mpi->stride[i]); + if (slice_h >= 0 && slice_y + slice_h < src_mpi->height) + // MPI below the overlay slice. + memcpy_pic(dst_mpi->planes[i] + dst_mpi->stride[i] * ((slice_y+slice_h) >> c), + src_mpi->planes[i] + src_mpi->stride[i] * ((slice_y+slice_h) >> c), + src_mpi->w, (src_mpi->height-(slice_y+slice_h)) >> c, + dst_mpi->stride[i], src_mpi->stride[i]); + } + +#ifdef HAVE_MMX + if(gCpuCaps.hasMMX) { + asm volatile( + "pxor %%mm7, %%mm7\n\t" // zero out %mm7 + "movq "MANGLE(MM_ROUND)", %%mm5\n\t" // %mm5 = round + ::: "memory" + ); + } +#endif + + for (w = priv->mpi_stride, plane = 0; plane < 3; plane++) { + if (plane == 1) { + w >>= 1; + slice_h >>= 1; + } + overlay = overlay_planes[plane]; + alpha = alpha_planes[plane]; + src = src_mpi_planes[plane]; + dst = dst_mpi_planes[plane]; + + // Global alpha is 256 which means ignore per-pixel alpha. Do + // straight memcpy. + if (priv->alpha == 256) { + memcpy_pic(dst, overlay, w, slice_h, dst_mpi->stride[plane], src_mpi->stride[plane]); + } else { + blend_plane(w, slice_h, dst, src, overlay, alpha, src_mpi->stride[plane], + dst_mpi->stride[plane], overlay_stride[plane]); + } + } + +#ifdef HAVE_MMX + if(gCpuCaps.hasMMX) + asm volatile( "emms\n\t" ::: "memory" ); +#endif + stopwatch(4, NULL); +} + + + +/** + * \brief Process a frame. + * + * \param vf Instance of this filter. + * \param mpi The image sent by the previous filter (or decoder). + * + * \return The return code of the next filter, or 0 on error. + * + * This function is called when a new video frame is to be drawn or when the + * overlay needs updating. If the lockbyte of the overlay shared memory + * buffer is set to BUFFER_LOCKED, it means the controlling application has + * made changes that need processing. All invalidated rectangles are + * converted from BGRA to YV12(A) and those regions are pre-alpha-multiplied. + * Subsequently, if the overlay is visible, it is composited over the mpi. + */ +static int +put_image(struct vf_instance_s* vf, mp_image_t* mpi, double pts) +{ + mp_image_t *dmpi = NULL; + + // Remember last mp image. + vf->priv->last_mpi = mpi; + + /* If the controlling application has locked the buffer and issued a + * slave command (which causes dirty to be 1) and it's time to update + * (i.e. more than 1/30th of a second has elapsed), then we process the + * invalidated regions by doing BGRA -> YV12A conversion and pre- + * alpha-multiply those areas. + */ + if (*vf->priv->lockbyte & BUFFER_LOCKED && vf->priv->dirty && should_update(vf->priv, 1)) { + struct rects *r, *next; + stopwatch(3, "putimage (convert)"); + r = vf->priv->invalid_rects; + while (r) { + if (r->type & RECT_CONVERT) + convert_bgra_to_yv12a(vf->priv, r->y, r->h); + if (r->type & RECT_PREMULTIPLY) + image_premultiply_alpha(vf->priv, r->x, r->y, r->w, r->h); + next = r->next; + free(r); + r = next; + } + vf->priv->invalid_rects = 0; + // YV12A version is fully up-to-date now, we can unlock the BGRA + // buffer. + *vf->priv->lockbyte = BUFFER_UNLOCKED; + vf->priv->dirty = 0; + + if (vf->priv->auto_slice) + calculate_slice(vf->priv); + } else + stopwatch(3, "putimage (no convert)"); + + if (vf->priv->visible != 0 && vf->priv->alpha > 0) { + // Overlay is visible. + dmpi = vf_get_image(vf->next, mpi->imgfmt, mpi->type, mpi->flags, vf->priv->mpi_w, vf->priv->mpi_h); + blend_image(vf->priv, mpi, dmpi); + } else if (!dmpi){ + // Overlay is hidden, so do a shallow copy. + dmpi = vf_get_image(vf->next, mpi->imgfmt, MP_IMGTYPE_EXPORT, MP_IMGFLAG_PRESERVE, + vf->priv->mpi_w, vf->priv->mpi_h); + dmpi->planes[0] = mpi->planes[0]; + dmpi->stride[0] = mpi->stride[0]; + if (dmpi->flags & MP_IMGFLAG_PLANAR) { + dmpi->planes[1] = mpi->planes[1]; + dmpi->stride[1] = mpi->stride[1]; + dmpi->planes[2] = mpi->planes[2]; + dmpi->stride[2] = mpi->stride[2]; + } + } + stopwatch(3, NULL); + return vf_next_put_image(vf, dmpi, pts); +} + + + +/** + * \brief Handle a slave command. + * + * \param cmd Structure holding the data for this command. + * \param paused The paused state of the video. + * \param priv Private data for this filter instance. + * + * This function is registered with a call to mp_input_add_cmd_filter() in + * vf_open and is used to handle MP_CMD_VF_OVERLAY (the "overlay" slave command), + * as well as track the pause state of the video. + * + * Slave command argument is a string in the form: cmd=args[,cmd=args[, ... ]] + * Possible commands are: + * + * invalidate=x:y:w:h + * Cause the specified rectangle to be updated on the overlay. + * (Internally this forces BGRA->YV12A colorspace conversion.) + * slice=y:h + * Draw only the specified slice (top / height) of the overlay. If + * -1:-1 are specified, use autoslicing. + * visible=val + * Draw overlay if val is 1, or don't draw overlay if val is 0 + * alpha=val + * Sets the global alpha level for the overlay. val==0 is + * semantically equivalent to visible=0; 256 means don't alpha blend. + * + * See DOCS/tech/vf_overlay.txt for more details. + */ +static int +cmd_filter(mp_cmd_t *cmd, int paused, struct vf_priv_s *priv) +{ + if (cmd->id == MP_CMD_VF_OVERLAY) { + char *p1, *p2, *args = cmd->args[0].v.s; + while (args && (p1 = strsep(&args, ","))) { + p2 = strpbrk(p1, "=,"); + if (!p2 || *(p2+1) == 0) + // Command with no arguments, must be malformed. + continue; + *p2 = 0; + + if (!strcasecmp(p1, "invalidate")) { + int x, y, w, h; + if (sscanf(p2+1, "%d:%d:%d:%d", &x, &y, &w, &h) == 4) + invalidate_rect(priv, x, y, w, h, RECT_CONVERT | RECT_PREMULTIPLY); + } + else if (!strcasecmp(p1, "slice")) { + int y, h; + if (sscanf(p2+1, "%d:%d", &y, &h) == 2) { + if (y == -1 || h == -1) { + priv->auto_slice = 1; + calculate_slice(priv); + } else { + translate_coords(priv, 0, &y, 0, &h); + priv->slice_y = y; + priv->slice_h = h; + priv->auto_slice = 0; + } + } + } + else if (!strcasecmp(p1, "alpha")) { + int alpha; + if (sscanf(p2+1, "%d", &alpha) == 1 && alpha != priv->alpha) { + priv->alpha = alpha; + fprintf(stderr, "@@@ ALPHA: %d\n", alpha); + invalidate_rect(priv, 0, 0, priv->w, priv->h, RECT_PREMULTIPLY); + } + } + else if (!strcasecmp(p1, "visible")) { + sscanf(p2+1, "%d", &priv->visible); + } + priv->dirty = 1; + } + // This command is handled, so return 1. This causes mp_input_get_cmd + // to return NULL and if we're paused, it keeps us paused. + return 1; + } + + if (cmd->id == MP_CMD_PAUSE) + priv->is_paused = !paused; + + return 0; +} + + + +/** + * \brief Handle VFCTRL commands. + * + * \param vf Instance of this filter. + * \param request The VFCTRL_* request to handle. + * \param data The data for the given VFCTRL command. + * + * \return The return value of the next filter. + * + * This function handles VFCTRL_PERIODIC_UPDATE which is called inside the pause + * and sleep looPS in mplayer.c. + */ +static int +control(struct vf_instance_s *vf, int request, void *data) +{ + /** \bug FIXME: Can't update if we're using double buffering. This means for + * double buffering the overlay update speed is only as fast as the video + * frame rate. + */ + if (request == VFCTRL_PERIODIC_UPDATE && (!vo_doublebuffering || vf->priv->is_paused)) { + //float time_avail = *(float *)data; + if (vf->priv->last_mpi && *vf->priv->lockbyte & BUFFER_LOCKED && should_update(vf->priv, 0)) { + // Process pending slave commands if we're not paused. + mp_input_get_cmd(0,0,1); + if (vf->priv->dirty) { + put_image(vf, vf->priv->last_mpi, 0); + // return CONTROL_TRUE to force page flip + return vf_next_control(vf, request, data), CONTROL_TRUE; + } + } + // Returns CONTROL_FALSE to consume the event unless a later filter + // returns CONTROL_TRUE. + return vf_next_control(vf, request, data) > 0 ? CONTROL_TRUE : CONTROL_FALSE; + } + return vf_next_control(vf, request, data); +} + + +/** + * \brief Initialize the overlay filter. + * + * \param vf Instance of this filter. + * \param args The arguments passed from the command line for this instance. + * The only argment is an integer representing the shared memory + * key. + * + * vf_overlay instances are intended to be "persistent"; in other words, they + * never get ununitialized. This is to allow overlay buffers to survive + * loadfile or a loop. + */ +static int +open(vf_instance_t* vf, char* args) +{ + int i = 0; + key_t shm_key; + + vf->config = config; + vf->put_image = put_image; + vf->query_format = query_format; + vf->control = control; + vf->uninit = NULL; // persistent + + if(!args || sscanf(args, "%u", &shm_key) < 1 ) { + mp_msg(MSGT_VFILTER, MSGL_ERR, "vf_overlay: bad args; usage: overlay=shmkey\n"); + return 0; + } + + // Check to see if we've already initialized a filter with this shmkey. If + // we have, then we reuse the private data, which allows image layers to + // survive a loadfile or a loop. + if (vf_overlay_priv) { + struct vf_priv_s *p; + for (i = 0, p = vf_overlay_priv[i]; i < num_instances; p = vf_overlay_priv[++i]) { + if (p->shm_key == shm_key) { + vf->priv = p; + return 1; + } + } + } + else { + /* Initial load; register with atexit to handle unlinking shmem + * objects. We don't do this with the filter's uninit because uninit + * gets called during a loadfile and we don't want to lose overlay data + * during loadfiles. + */ + premultiply_alpha_byte_8 = premultiply_alpha_byte_8_C; + blend_plane = blend_plane_C; +#ifdef HAVE_MMX + if(gCpuCaps.hasMMX) { + premultiply_alpha_byte_8 = premultiply_alpha_byte_8_MMX; + blend_plane = blend_plane_MMX; + } +#endif + atexit(free_all_overlay); + } + + // New filter, so create and initialize the private data + vf->priv = calloc(1, sizeof(struct vf_priv_s)); + vf->priv->shm_key = shm_key; + + mp_input_add_cmd_filter((mp_input_cmd_filter)cmd_filter, vf->priv); + + // Grow the arrays and keep track of the private data, as well as the instance + // structs, used for handling the pause loop. + vf_overlay_priv = (struct vf_priv_s **)realloc(vf_overlay_priv, i + 1); + vf_overlay_priv[i] = vf->priv; + num_instances = i + 1; + return 1; +} + + + +/** + * \brief Info about this filter for registering with vf. + */ +const vf_info_t vf_info_overlay = { + "Shared memory image overlay with alpha compositing", + "overlay", + "Jason Tackaberry", + "", + open, + NULL +}; + +#endif Index: libmpcodecs/vf_outbuf.c =================================================================== --- libmpcodecs/vf_outbuf.c (revision 0) +++ libmpcodecs/vf_outbuf.c (revision 0) @@ -0,0 +1,262 @@ +#include "config.h" + +#ifdef HAVE_SHM + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mp_image.h" +#include "vf.h" +#include "img_format.h" + +#include + +#include "mp_msg.h" +#include "libvo/fastmemcpy.h" +#include "libvo/video_out.h" +#include "libswscale/swscale.h" +#include "input/input.h" +#include "osdep/timer.h" +#include "libavutil/avutil.h" +#include "vf_scale.h" + +// defined in fmt-conversion.h and included by vf_scale.c +enum PixelFormat imgfmt2pixfmt(int fmt); + +// Lock flags. +// +#define BUFFER_UNLOCKED 0x10 +#define BUFFER_LOCKED 0x20 + + +struct vf_priv_s { + int mpi_w, mpi_h, active, mpifmt, buffmt, + dst_w, dst_h; + double aspect; + uint8_t *shmem; + int shm_id; + key_t shm_key; + struct SwsContext *sws; +}; + +static void setup_sws(struct vf_priv_s *priv) +{ + if (priv->sws) + sws_freeContext(priv->sws); + + priv->sws = sws_getContext(priv->mpi_w, priv->mpi_h, imgfmt2pixfmt(priv->mpifmt), + priv->dst_w, priv->dst_h, imgfmt2pixfmt(priv->buffmt), + get_sws_cpuflags()|SWS_PRINT_INFO|SWS_BILINEAR, + NULL, NULL, NULL); +} + + +static int +config(struct vf_instance_s* vf, int width, int height, + int d_width, int d_height, unsigned int flags, unsigned int mpifmt) +{ + int fd, size; + vf->priv->dst_w = vf->priv->mpi_w = (width+1) & ~1; + vf->priv->dst_h = vf->priv->mpi_h = (height+1) & ~1; + size = vf->priv->dst_w * vf->priv->dst_h * 4 + 16; + + vf->priv->shm_id = shmget(vf->priv->shm_key, size, IPC_CREAT | 0600); + if (vf->priv->shm_id < 0) { + mp_msg(MSGT_VFILTER, MSGL_ERR, "\noutbuf: ERROR: unable to open shmem (key %d)\n", vf->priv->shm_key); + return 0; + } + vf->priv->shmem = shmat(vf->priv->shm_id, NULL, 0); + if (!vf->priv->shmem) { + mp_msg(MSGT_VFILTER, MSGL_ERR, "\noutbuf: ERROR: couldn't mmap %d bytes from shmem (%d)\n", size, vf->priv->shm_key); + return 0; + } + + vf->priv->mpifmt = mpifmt; + vf->priv->active = 1; + vf->priv->aspect = (double)d_width/d_height; + mp_msg(MSGT_VFILTER, MSGL_INFO, "outbuf: %dx%d %s; shmem key: %u.\n", + width, height, vo_format_name(vf->priv->buffmt), vf->priv->shm_key); + + setup_sws(vf->priv); + return vf_next_config(vf, width, height, d_width, d_height, flags, mpifmt); +} + + +static int +query_format(struct vf_instance_s* vf, unsigned int fmt) +{ + if (fmt == IMGFMT_YV12) + return vf_next_query_format(vf, fmt); + return 0; +} + + +static int +put_image(struct vf_instance_s* vf, mp_image_t* mpi, double pts) +{ + struct { + short lock, width, height; + double aspect; + } header = { + .lock = BUFFER_UNLOCKED, + .width = vf->priv->dst_w, + .height = vf->priv->dst_h, + .aspect = vf->priv->aspect + }; + + struct timeval curtime; + struct timezone tz; + double start_time, now; + + if (vf->priv->active == 0) + return 0; + + if (vf->priv->active == 1) { + return vf_next_put_image(vf, mpi, pts); + } + + if (vf->priv->mpi_w != mpi->width || vf->priv->mpi_h != mpi->height) { + vf->priv->mpi_w = mpi->width; + vf->priv->mpi_h = mpi->height; + setup_sws(vf->priv); + } + + // Wait at most 0.1 seconds for the client to unlock the buffer. + gettimeofday(&curtime, &tz); + start_time = now = curtime.tv_sec + (curtime.tv_usec/(1000.0*1000)); + while (*vf->priv->shmem & BUFFER_LOCKED && now - start_time < 0.1) { + gettimeofday(&curtime, &tz); + now = curtime.tv_sec + (curtime.tv_usec/(1000.0*1000)); + usec_sleep(1); + } + + if (!(*vf->priv->shmem & BUFFER_LOCKED) || (now - start_time >= 0.1)) + fprintf(stderr, "@@@ outbuf timeout: lock=%d time=%f\n", *vf->priv->shmem & BUFFER_LOCKED, now-start_time); + + + if (now - start_time < 0.1) { + switch (vf->priv->buffmt) { + case IMGFMT_BGR32: { + uint8_t *dst[3]= {vf->priv->shmem + 16, NULL, NULL}; + int dst_stride[3]= {vf->priv->dst_w * 4, 0, 0}; + sws_scale(vf->priv->sws, mpi->planes, mpi->stride, 0, vf->priv->mpi_h, dst, dst_stride); + break; + } + case IMGFMT_YV12: { + uint8_t *dst[3]; + int dst_stride[3], i, stride = vf->priv->dst_w, h = vf->priv->dst_h; + unsigned char *p = vf->priv->shmem + 16; + + for (i = 0; i < 3; p += stride * h, i++) { + if (i == 1) { stride >>= 1; h >>= 1;} + dst[i] = p; + dst_stride[i] = stride; + } + sws_scale(vf->priv->sws, mpi->planes, mpi->stride, 0, vf->priv->mpi_h, dst, dst_stride); + break; + } + } + memcpy(vf->priv->shmem, &header, sizeof(header)); + *vf->priv->shmem = BUFFER_LOCKED; + } + + if (vf->priv->active == 3) + return vf_next_put_image(vf, mpi, pts); + + return 0; + +} + +static int +cmd_filter(mp_cmd_t* cmd, int paused, struct vf_priv_s * priv) +{ + if (cmd->id == MP_CMD_VF_OUTBUF) { + int w = priv->dst_w, h = priv->dst_h; + priv->active = cmd->args[0].v.i; + if (cmd->nargs > 1) + w = cmd->args[1].v.i ? cmd->args[1].v.i : priv->mpi_w; + if (cmd->nargs > 2) + h = cmd->args[2].v.i ? cmd->args[2].v.i : priv->mpi_h; + + w = (w > priv->mpi_w) ? priv->mpi_w : w; + h = (h > priv->mpi_h) ? priv->mpi_h : h; + if (w != priv->dst_w || h != priv->dst_h) { + priv->dst_w = (w+1) & ~1; + priv->dst_h = (h+1) & ~1; + setup_sws(priv); + } + + return 1; + } + return 0; +} + +static void uninit(struct vf_instance_s *vf) +{ + struct shmid_ds shmemds; + + sws_freeContext(vf->priv->sws); + shmctl(vf->priv->shm_id, IPC_RMID, &shmemds); + shmdt(vf->priv->shmem); + + free(vf->priv); +} + + +static int +vf_open(vf_instance_t* vf, char* args) +{ + char *p; + int argn = 0; + + vf->config = config; + vf->put_image = put_image; + vf->query_format = query_format; + vf->uninit = uninit; + + vf->priv = calloc(1, sizeof(struct vf_priv_s)); + vf->priv->active = 1; + vf->priv->buffmt = IMGFMT_YV12; + + while ((p = strsep(&args, ":"))) { + //if (argn == 0) strcpy(vf->priv->shmem_name, p); + if (argn == 0) vf->priv->shm_key = atoi(p); + else if (argn == 1) { + if (!strcasecmp(p, "bgr32")) + vf->priv->buffmt = IMGFMT_BGR32; + else if (!strcasecmp(p, "yv12")) + vf->priv->buffmt = IMGFMT_YV12; + else { + mp_msg(MSGT_VFILTER, MSGL_ERR, "\noutbuf: ERROR: invalid format '%s'\n", p); + return 0; + } + } + else if (argn == 2) + vf->priv->active = atoi(p); + argn++; + } + + mp_input_add_cmd_filter((mp_input_cmd_filter)cmd_filter, vf->priv); + return 1; +} + + +const vf_info_t vf_info_outbuf = { + "Write video frame to shared memory", + "outbuf", + "Jason Tackaberry", + "", + vf_open, + NULL +}; + +#endif Index: libmpcodecs/vf.c =================================================================== --- libmpcodecs/vf.c (revision 27232) +++ libmpcodecs/vf.c (working copy) @@ -99,6 +99,10 @@ extern const vf_info_t vf_info_blackframe; extern const vf_info_t vf_info_geq; extern const vf_info_t vf_info_ow; +#ifdef HAVE_SHM +extern vf_info_t vf_info_overlay; +extern vf_info_t vf_info_outbuf; +#endif // list of available filters: static const vf_info_t* const filter_list[]={ @@ -190,6 +194,10 @@ #endif &vf_info_yadif, &vf_info_blackframe, +#ifdef HAVE_SHM + &vf_info_overlay, + &vf_info_outbuf, +#endif &vf_info_ow, NULL }; Index: libmpcodecs/vf.h =================================================================== --- libmpcodecs/vf.h (revision 27232) +++ libmpcodecs/vf.h (working copy) @@ -70,6 +70,7 @@ int value; } vf_equalizer_t; +#define VFCTRL_NOTIFY_PTS 99 #define VFCTRL_QUERY_MAX_PP_LEVEL 4 /* test for postprocessing support (max level) */ #define VFCTRL_SET_PP_LEVEL 5 /* set postprocessing level */ #define VFCTRL_SET_EQUALIZER 6 /* set color options (brightness,contrast etc) */ @@ -86,6 +87,7 @@ #define VFCTRL_GET_PTS 17 /* Return last pts value that reached vf_vo*/ #define VFCTRL_SET_DEINTERLACE 18 /* Set deinterlacing status */ #define VFCTRL_GET_DEINTERLACE 19 /* Get deinterlacing status */ +#define VFCTRL_PERIODIC_UPDATE 20 /* Called whenever MPlayer is idle */ #include "vfcap.h" Index: input/input.c =================================================================== --- input/input.c (revision 27232) +++ input/input.c (working copy) @@ -191,7 +191,12 @@ { MP_CMD_SEEK_CHAPTER, "seek_chapter", 1, { {MP_CMD_ARG_INT,{0}}, {MP_CMD_ARG_INT,{0}}, {-1,{0}} } }, { MP_CMD_SET_MOUSE_POS, "set_mouse_pos", 2, { {MP_CMD_ARG_INT,{0}}, {MP_CMD_ARG_INT,{0}}, {-1,{0}} } }, - + +#ifdef HAVE_SHM + { MP_CMD_VF_OVERLAY, "overlay", 1, { {MP_CMD_ARG_STRING,{0}}, {-1,{0}}}}, + { MP_CMD_VF_OUTBUF, "outbuf", 1, { {MP_CMD_ARG_INT,{0}}, {-1,{0}}}}, +#endif + { 0, NULL, 0, {} } }; Index: input/input.h =================================================================== --- input/input.h (revision 27232) +++ input/input.h (working copy) @@ -3,6 +3,8 @@ // All command IDs typedef enum { + MP_CMD_VF_OVERLAY, + MP_CMD_VF_OUTBUF, MP_CMD_SEEK, MP_CMD_AUDIO_DELAY, MP_CMD_QUIT, Index: Makefile =================================================================== --- Makefile (revision 27232) +++ Makefile (working copy) @@ -140,6 +140,8 @@ libmpcodecs/vf_mirror.c \ libmpcodecs/vf_noformat.c \ libmpcodecs/vf_noise.c \ + libmpcodecs/vf_outbuf.c \ + libmpcodecs/vf_overlay.c \ libmpcodecs/vf_ow.c \ libmpcodecs/vf_palette.c \ libmpcodecs/vf_perspective.c \ Index: DOCS/tech/slave.txt =================================================================== --- DOCS/tech/slave.txt (revision 27232) +++ DOCS/tech/slave.txt (working copy) @@ -195,6 +195,10 @@ osd_show_text [duration] [level] Show on the OSD. +overlay + Manipulate the overlay filter. See DOCS/tech/vf_overlay.txt for a + detailed description of what commands are available. + panscan <-1.0 - 1.0> | <0.0 - 1.0> Increase or decrease the pan-and-scan range by , 1.0 is the maximum. Negative values decrease the pan-and-scan range. Index: DOCS/man/en/mplayer.1 =================================================================== --- DOCS/man/en/mplayer.1 (revision 27232) +++ DOCS/man/en/mplayer.1 (working copy) @@ -6992,6 +6992,35 @@ .PD 1 . .TP +.B overlay=shmkey +Provides an overlay image buffer that can be accessed via shared memory. +This filter can be used by applications controlling MPlayer to provide a +custom on-screen display. +The overlay image is composited over the running video and supports global +and per-pixel alpha blending. +Pixels are specified in BGRA format, and the size of the overlay image is the +video display size. +In order to have a specific overlay image size, you can precede this filter +with the scale, expand, and/or dsize filters. +.sp 1 +The filter is controlled by the overlay slave command, and it may be used +even when the video is paused. +This slave command can be used to update regions of the overlay image, toggle +visibility, adjust the global alpha level, etc. +.sp 1 +.PD 0 +.RSs +.IPs +an integer that will be used as the key for the SysV shared memory segment +.RE +.PD 1 +.sp 1 +.RS +See DOCS/tech/vf_overlay.txt for more details about how this filter works and +how to control it with the slave command. +.REss +. +.TP .B framestep=I|[i]step Renders only every nth frame or every intra frame (keyframe). .sp 1 Index: mplayer.c =================================================================== --- mplayer.c (revision 27232) +++ mplayer.c (working copy) @@ -1327,6 +1327,30 @@ } +/** + * \brief send a VFCTRL_PERIODIC_UPDATE through the filter chain + * \param vf first video filter to receive the control + * \param vo needed to flip if requested + * \param time_avail time available, might decide not to send the + * control if this is too small + * \return 1 if the event was consumed or not sent, 0 otherwise + * + * When the filter chain returns CONTROL_TRUE, a flip_page + * will be performed, a value < 0 means it was not consumed, + * causing less events to be produced in the future + */ +static int periodic_update(vf_instance_t *vf, const vo_functions_t *vo, + float time_avail) { + int res; + if (!vf) return 0; + if (time_avail < 0.01) return 1; + res = vf->control(vf, VFCTRL_PERIODIC_UPDATE, &time_avail); + if (res == CONTROL_OK && vo_config_count) + vo->flip_page(); + return res > 0; +} + + typedef struct mp_osd_msg mp_osd_msg_t; struct mp_osd_msg { /// Previous message on the stack. @@ -1786,6 +1810,8 @@ current_module="sleep_rtc"; while (time_frame > 0.000) { unsigned long rtc_ts; + periodic_update(mpctx->sh_video->vfilter, mpctx->video_out, time_frame); + time_frame-=GetRelativeTime(); if (read(rtc_fd, &rtc_ts, sizeof(rtc_ts)) <= 0) mp_msg(MSGT_CPLAYER, MSGL_ERR, MSGTR_LinuxRTCReadError, strerror(errno)); time_frame -= GetRelativeTime(); @@ -1798,8 +1824,10 @@ float margin = softsleep ? 0.011 : 0; current_module = "sleep_timer"; while (time_frame > margin) { - usec_sleep(1000000 * (time_frame - margin)); + int mul = 100 * !periodic_update(mpctx->sh_video->vfilter, mpctx->video_out, time_frame); time_frame -= GetRelativeTime(); + usec_sleep(10000 * mul * (time_frame - margin)); + time_frame -= GetRelativeTime(); } if (softsleep){ current_module = "sleep_soft"; @@ -2352,7 +2380,7 @@ if (mpctx->audio_out && mpctx->sh_audio) mpctx->audio_out->pause(); // pause audio, keep data if possible - while ( (cmd = mp_input_get_cmd(20, 1, 1)) == NULL + while ( (cmd = mp_input_get_cmd(3, 1, 1)) == NULL || cmd->id == MP_CMD_SET_MOUSE_POS) { if (cmd) { cmd = mp_input_get_cmd(0,1,0); @@ -2373,6 +2401,9 @@ if (vf_menu) vf_menu_pause_update(vf_menu); #endif + if (mpctx->sh_video && periodic_update(mpctx->sh_video->vfilter, mpctx->video_out, 1)) + usec_sleep(1000); + else usec_sleep(20000); } if (cmd && cmd->id == MP_CMD_PAUSE) { Index: mencoder.c =================================================================== --- mencoder.c (revision 27232) +++ mencoder.c (working copy) @@ -231,6 +231,11 @@ char *current_module; #include "mpcommon.h" +// Stubs for filters that might call these functions, to satisfy linker. These +// filters are probably only useful for mplayer. +void mp_input_add_cmd_filter(void *func, void* ctx) {} +void *mp_input_get_cmd(int time, int paused, int peek_only) { return NULL; } + //char *out_audio_codec=NULL; // override audio codec //char *out_video_codec=NULL; // override video codec