MagickCore  6.9.7
accelerate-kernels-private.h
Go to the documentation of this file.
1 /*
2  Copyright 1999-2017 ImageMagick Studio LLC, a non-profit organization
3  dedicated to making software imaging solutions freely available.
4 
5  You may not use this file except in compliance with the License.
6  obtain a copy of the License at
7 
8  http://www.imagemagick.org/script/license.php
9 
10  Unless required by applicable law or agreed to in writing, software
11  distributed under the License is distributed on an "AS IS" BASIS,
12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  See the License for the specific language governing permissions and
14  limitations under the License.
15 
16  MagickCore private methods for accelerated functions.
17 */
18 
19 #ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
20 #define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
21 
22 #if defined(__cplusplus) || defined(c_plusplus)
23 extern "C" {
24 #endif
25 
26 #if defined(MAGICKCORE_OPENCL_SUPPORT)
27 
28 /*
29  Define declarations.
30 */
31 #define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n"
32 #define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n"
33 #define OPENCL_ELSE() "\n #""else " " \n"
34 #define OPENCL_ENDIF() "\n #""endif " " \n"
35 #define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n"
36 #define STRINGIFY(...) #__VA_ARGS__ "\n"
37 
38 const char* accelerateKernels =
39 
40 /*
41  Define declarations.
42 */
43  OPENCL_DEFINE(GetPixelAlpha(pixel),(QuantumRange-(pixel).w))
44  OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
45  OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
46  OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
47  OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
48  OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
49  OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
50  OPENCL_DEFINE(SigmaRandom, (attenuate))
51  OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
52  OPENCL_DEFINE(MagickMax(x, y), (((x) > (y)) ? (x) : (y)))
53  OPENCL_DEFINE(MagickMin(x, y), (((x) < (y)) ? (x) : (y)))
54 
55 /*
56  Typedef declarations.
57 */
58  STRINGIFY(
59  typedef enum
60  {
62  RGBColorspace, /* Linear RGB colorspace */
63  GRAYColorspace, /* greyscale (linear) image (faked 1 channel) */
73  CMYKColorspace, /* negared linear RGB with black separated */
74  sRGBColorspace, /* Default: non-lienar sRGB colorspace */
83  CMYColorspace, /* negated linear RGB colorspace */
86  LCHColorspace, /* alias for LCHuv */
88  LCHabColorspace, /* Cylindrical (Polar) Lab */
89  LCHuvColorspace, /* Cylindrical (Polar) Luv */
92  HSVColorspace, /* alias for HSB */
96  )
97 
98  STRINGIFY(
99  typedef enum
100  {
156  /* These are new operators, added after the above was last sorted.
157  * The list should be re-sorted only when a new library version is
158  * created.
159  */
174  )
175 
176  STRINGIFY(
177  typedef enum
178  {
184  } MagickFunction;
185  )
186 
187  STRINGIFY(
188  typedef enum
189  {
191  UniformNoise,
194  ImpulseNoise,
196  PoissonNoise,
198  } NoiseType;
199  )
200 
201  STRINGIFY(
202  typedef enum
203  {
215  )
216 
217  STRINGIFY(
218  typedef enum {
236  )
237 
238  STRINGIFY(
239  typedef enum
240  {
242  RedChannel = 0x0001,
243  GrayChannel = 0x0001,
244  CyanChannel = 0x0001,
245  GreenChannel = 0x0002,
246  MagentaChannel = 0x0002,
247  BlueChannel = 0x0004,
248  YellowChannel = 0x0004,
249  AlphaChannel = 0x0008,
250  OpacityChannel = 0x0008,
251  MatteChannel = 0x0008, /* deprecated */
252  BlackChannel = 0x0020,
253  IndexChannel = 0x0020,
254  CompositeChannels = 0x002F,
255  AllChannels = 0x7ffffff,
256  /*
257  Special purpose channel types.
258  */
259  TrueAlphaChannel = 0x0040, /* extract actual alpha channel from opacity */
260  RGBChannels = 0x0080, /* set alpha from grayscale mask in RGB */
261  GrayChannels = 0x0080,
262  SyncChannels = 0x0100, /* channels should be modified equally */
264  } ChannelType;
265  )
266 
267 /*
268  Helper functions.
269 */
270 
271 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
272 
273  STRINGIFY(
274  inline CLQuantum ScaleCharToQuantum(const unsigned char value)
275  {
276  return((CLQuantum) value);
277  }
278  )
279 
280 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
281 
282  STRINGIFY(
283  inline CLQuantum ScaleCharToQuantum(const unsigned char value)
284  {
285  return((CLQuantum) (257.0f*value));
286  }
287  )
288 
289 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
290 
291  STRINGIFY(
292  inline CLQuantum ScaleCharToQuantum(const unsigned char value)
293  {
294  return((CLQuantum) (16843009.0*value));
295  }
296  )
297 
298 OPENCL_ENDIF()
299 
300 STRINGIFY(
301  inline int ClampToCanvas(const int offset, const int range)
302  {
303  return clamp(offset, (int)0, range - 1);
304  }
305  )
306 
307  STRINGIFY(
308  inline int ClampToCanvasWithHalo(const int offset, const int range, const int edge, const int section)
309  {
310  return clamp(offset, section ? (int)(0 - edge) : (int)0, section ? (range - 1) : (range - 1 + edge));
311  }
312  )
313 
314  STRINGIFY(
315  inline CLQuantum ClampToQuantum(const float value)
316  {
317  return (CLQuantum)(clamp(value, 0.0f, (float)QuantumRange) + 0.5f);
318  }
319  )
320 
321  STRINGIFY(
322  inline uint ScaleQuantumToMap(CLQuantum value)
323  {
324  if (value >= (CLQuantum)MaxMap)
325  return ((uint)MaxMap);
326  else
327  return ((uint)value);
328  }
329  )
330 
331  STRINGIFY(
332  inline float PerceptibleReciprocal(const float x)
333  {
334  float sign = x < (float) 0.0 ? (float)-1.0 : (float) 1.0;
335  return((sign*x) >= MagickEpsilon ? (float) 1.0 / x : sign*((float) 1.0 / MagickEpsilon));
336  }
337  )
338 
339  STRINGIFY(
340  inline float RoundToUnity(const float value)
341  {
342  return clamp(value, 0.0f, 1.0f);
343  }
344  )
345 
346  STRINGIFY(
347 
348  inline CLQuantum getBlue(CLPixelType p) { return p.x; }
349  inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; }
350  inline float getBlueF4(float4 p) { return p.x; }
351  inline void setBlueF4(float4* p, float value) { (*p).x = value; }
352 
353  inline CLQuantum getGreen(CLPixelType p) { return p.y; }
354  inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
355  inline float getGreenF4(float4 p) { return p.y; }
356  inline void setGreenF4(float4* p, float value) { (*p).y = value; }
357 
358  inline CLQuantum getRed(CLPixelType p) { return p.z; }
359  inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; }
360  inline float getRedF4(float4 p) { return p.z; }
361  inline void setRedF4(float4* p, float value) { (*p).z = value; }
362 
363  inline CLQuantum getOpacity(CLPixelType p) { return p.w; }
364  inline void setOpacity(CLPixelType* p, CLQuantum value) { (*p).w = value; }
365  inline float getOpacityF4(float4 p) { return p.w; }
366  inline void setOpacityF4(float4* p, float value) { (*p).w = value; }
367 
368  inline void setGray(CLPixelType* p, CLQuantum value) { (*p).z = value; (*p).y = value; (*p).x = value; }
369 
370  inline float GetPixelIntensity(const int method, const int colorspace, CLPixelType p)
371  {
372  float red = getRed(p);
373  float green = getGreen(p);
374  float blue = getBlue(p);
375 
376  float intensity;
377 
378  if (colorspace == GRAYColorspace)
379  return red;
380 
381  switch (method)
382  {
384  {
385  intensity = (red + green + blue) / 3.0;
386  break;
387  }
389  {
390  intensity = MagickMax(MagickMax(red, green), blue);
391  break;
392  }
394  {
395  intensity = (MagickMin(MagickMin(red, green), blue) +
396  MagickMax(MagickMax(red, green), blue)) / 2.0;
397  break;
398  }
400  {
401  intensity = (float)(((float)red*red + green*green + blue*blue) /
402  (3.0*QuantumRange));
403  break;
404  }
406  {
407  /*
408  if (image->colorspace == RGBColorspace)
409  {
410  red=EncodePixelGamma(red);
411  green=EncodePixelGamma(green);
412  blue=EncodePixelGamma(blue);
413  }
414  */
415  intensity = 0.298839*red + 0.586811*green + 0.114350*blue;
416  break;
417  }
419  {
420  /*
421  if (image->colorspace == sRGBColorspace)
422  {
423  red=DecodePixelGamma(red);
424  green=DecodePixelGamma(green);
425  blue=DecodePixelGamma(blue);
426  }
427  */
428  intensity = 0.298839*red + 0.586811*green + 0.114350*blue;
429  break;
430  }
432  default:
433  {
434  /*
435  if (image->colorspace == RGBColorspace)
436  {
437  red=EncodePixelGamma(red);
438  green=EncodePixelGamma(green);
439  blue=EncodePixelGamma(blue);
440  }
441  */
442  intensity = 0.212656*red + 0.715158*green + 0.072186*blue;
443  break;
444  }
446  {
447  /*
448  if (image->colorspace == sRGBColorspace)
449  {
450  red=DecodePixelGamma(red);
451  green=DecodePixelGamma(green);
452  blue=DecodePixelGamma(blue);
453  }
454  */
455  intensity = 0.212656*red + 0.715158*green + 0.072186*blue;
456  break;
457  }
459  {
460  intensity = (float)(sqrt((float)red*red + green*green + blue*blue) /
461  sqrt(3.0));
462  break;
463  }
464  }
465 
466  return intensity;
467 
468  }
469  )
470 
471 /*
472 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
473 % %
474 % %
475 % %
476 % A d d N o i s e %
477 % %
478 % %
479 % %
480 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
481 */
482 
483 STRINGIFY(
484 
485 /*
486 Part of MWC64X by David Thomas, dt10@imperial.ac.uk
487 This is provided under BSD, full license is with the main package.
488 See http://www.doc.ic.ac.uk/~dt10/research
489 */
490 
491 // Pre: a<M, b<M
492 // Post: r=(a+b) mod M
493 ulong MWC_AddMod64(ulong a, ulong b, ulong M)
494 {
495  ulong v=a+b;
496  //if( (v>=M) || (v<a) )
497  if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
498  v=v-M;
499  return v;
500 }
501 
502 // Pre: a<M,b<M
503 // Post: r=(a*b) mod M
504 // This could be done more efficently, but it is portable, and should
505 // be easy to understand. It can be replaced with any of the better
506 // modular multiplication algorithms (for example if you know you have
507 // double precision available or something).
508 ulong MWC_MulMod64(ulong a, ulong b, ulong M)
509 {
510  ulong r=0;
511  while(a!=0){
512  if(a&1)
513  r=MWC_AddMod64(r,b,M);
514  b=MWC_AddMod64(b,b,M);
515  a=a>>1;
516  }
517  return r;
518 }
519 
520 
521 // Pre: a<M, e>=0
522 // Post: r=(a^b) mod M
523 // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
524 // most architectures
525 ulong MWC_PowMod64(ulong a, ulong e, ulong M)
526 {
527  ulong sqr=a, acc=1;
528  while(e!=0){
529  if(e&1)
530  acc=MWC_MulMod64(acc,sqr,M);
531  sqr=MWC_MulMod64(sqr,sqr,M);
532  e=e>>1;
533  }
534  return acc;
535 }
536 
537 uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
538 {
539  ulong m=MWC_PowMod64(A, distance, M);
540  ulong x=curr.x*(ulong)A+curr.y;
541  x=MWC_MulMod64(x, m, M);
542  return (uint2)((uint)(x/A), (uint)(x%A));
543 }
544 
545 uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
546 {
547  // This is an arbitrary constant for starting LCG jumping from. I didn't
548  // want to start from 1, as then you end up with the two or three first values
549  // being a bit poor in ones - once you've decided that, one constant is as
550  // good as any another. There is no deep mathematical reason for it, I just
551  // generated a random number.
552  enum{ MWC_BASEID = 4077358422479273989UL };
553 
554  ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
555  ulong m=MWC_PowMod64(A, dist, M);
556 
557  ulong x=MWC_MulMod64(MWC_BASEID, m, M);
558  return (uint2)((uint)(x/A), (uint)(x%A));
559 }
560 
562 typedef struct{ uint x; uint c; } mwc64x_state_t;
563 
564 enum{ MWC64X_A = 4294883355U };
565 enum{ MWC64X_M = 18446383549859758079UL };
566 
567 void MWC64X_Step(mwc64x_state_t *s)
568 {
569  uint X=s->x, C=s->c;
570 
571  uint Xn=MWC64X_A*X+C;
572  uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
573  uint Cn=mad_hi(MWC64X_A,X,carry);
574 
575  s->x=Xn;
576  s->c=Cn;
577 }
578 
579 void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
580 {
581  uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance);
582  s->x=tmp.x;
583  s->c=tmp.y;
584 }
585 
586 void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
587 {
588  uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset);
589  s->x=tmp.x;
590  s->c=tmp.y;
591 }
592 
594 uint MWC64X_NextUint(mwc64x_state_t *s)
595 {
596  uint res=s->x ^ s->c;
597  MWC64X_Step(s);
598  return res;
599 }
600 
601 //
602 // End of MWC64X excerpt
603 //
604 
605  float mwcReadPseudoRandomValue(mwc64x_state_t* rng) {
606  return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
607  }
608 
609 
610  float mwcGenerateDifferentialNoise(mwc64x_state_t* r, CLQuantum pixel, NoiseType noise_type, float attenuate) {
611 
612  float
613  alpha,
614  beta,
615  noise,
616  sigma;
617 
618  noise = 0.0f;
619  alpha=mwcReadPseudoRandomValue(r);
620  switch(noise_type) {
621  case UniformNoise:
622  default:
623  {
624  noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
625  break;
626  }
627  case GaussianNoise:
628  {
629  float
630  gamma,
631  tau;
632 
633  if (alpha == 0.0f)
634  alpha=1.0f;
635  beta=mwcReadPseudoRandomValue(r);
636  gamma=sqrt(-2.0f*log(alpha));
637  sigma=gamma*cospi((2.0f*beta));
638  tau=gamma*sinpi((2.0f*beta));
639  noise=(float)(pixel+sqrt((float) pixel)*SigmaGaussian*sigma+
641  break;
642  }
643 
644 
645  case ImpulseNoise:
646  {
647  if (alpha < (SigmaImpulse/2.0f))
648  noise=0.0f;
649  else
650  if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
651  noise=(float)QuantumRange;
652  else
653  noise=(float)pixel;
654  break;
655  }
656  case LaplacianNoise:
657  {
658  if (alpha <= 0.5f)
659  {
660  if (alpha <= MagickEpsilon)
661  noise=(float) (pixel-QuantumRange);
662  else
663  noise=(float) (pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
664  0.5f);
665  break;
666  }
667  beta=1.0f-alpha;
668  if (beta <= (0.5f*MagickEpsilon))
669  noise=(float) (pixel+QuantumRange);
670  else
671  noise=(float) (pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
672  break;
673  }
675  {
676  sigma=1.0f;
677  if (alpha > MagickEpsilon)
678  sigma=sqrt(-2.0f*log(alpha));
679  beta=mwcReadPseudoRandomValue(r);
680  noise=(float) (pixel+pixel*SigmaMultiplicativeGaussian*sigma*
681  cospi((float) (2.0f*beta))/2.0f);
682  break;
683  }
684  case PoissonNoise:
685  {
686  float
687  poisson;
688  unsigned int i;
689  poisson=exp(-SigmaPoisson*QuantumScale*pixel);
690  for (i=0; alpha > poisson; i++)
691  {
692  beta=mwcReadPseudoRandomValue(r);
693  alpha*=beta;
694  }
695  noise=(float) (QuantumRange*i/SigmaPoisson);
696  break;
697  }
698  case RandomNoise:
699  {
700  noise=(float) (QuantumRange*SigmaRandom*alpha);
701  break;
702  }
703 
704  };
705  return noise;
706  }
707 
708  __kernel
709  void AddNoise(const __global CLPixelType* inputImage, __global CLPixelType* filteredImage
710  ,const unsigned int inputPixelCount, const unsigned int pixelsPerWorkItem
711  ,const ChannelType channel
712  ,const NoiseType noise_type, const float attenuate
713  ,const unsigned int seed0, const unsigned int seed1
714  ,const unsigned int numRandomNumbersPerPixel) {
715 
716  mwc64x_state_t rng;
717  rng.x = seed0;
718  rng.c = seed1;
719 
720  uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
721  uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
722 
723  MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
724 
725  uint pos = get_local_size(0) * get_group_id(0) * pixelsPerWorkItem + get_local_id(0); // pixel to process
726 
727  uint count = pixelsPerWorkItem;
728 
729  while (count > 0) {
730  if (pos < inputPixelCount) {
731  CLPixelType p = inputImage[pos];
732 
733  if ((channel&RedChannel)!=0) {
734  setRed(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getRed(p),noise_type,attenuate)));
735  }
736 
737  if ((channel&GreenChannel)!=0) {
738  setGreen(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getGreen(p),noise_type,attenuate)));
739  }
740 
741  if ((channel&BlueChannel)!=0) {
742  setBlue(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getBlue(p),noise_type,attenuate)));
743  }
744 
745  if ((channel & OpacityChannel) != 0) {
746  setOpacity(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getOpacity(p),noise_type,attenuate)));
747  }
748 
749  filteredImage[pos] = p;
750  //filteredImage[pos] = (CLPixelType)(MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, 255);
751  }
752  pos += get_local_size(0);
753  --count;
754  }
755  }
756  )
757 
758 /*
759 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
760 % %
761 % %
762 % %
763 % B l u r %
764 % %
765 % %
766 % %
767 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
768 */
769 
770  STRINGIFY(
771  /*
772  Reduce image noise and reduce detail levels by row
773  im: input pixels filtered_in filtered_im: output pixels
774  filter : convolve kernel width: convolve kernel size
775  channel : define which channel is blured
776  is_RGBA_BGRA : define the input is RGBA or BGRA
777  */
778  __kernel void BlurRow(__global CLPixelType *im, __global float4 *filtered_im,
779  const ChannelType channel, __constant float *filter,
780  const unsigned int width,
781  const unsigned int imageColumns, const unsigned int imageRows,
782  __local CLPixelType *temp)
783  {
784  const int x = get_global_id(0);
785  const int y = get_global_id(1);
786 
787  const int columns = imageColumns;
788 
789  const unsigned int radius = (width-1)/2;
790  const int wsize = get_local_size(0);
791  const unsigned int loadSize = wsize+width;
792 
793  //load chunk only for now
794  //event_t e = async_work_group_copy(temp+radius, im+x+y*columns, wsize, 0);
795  //wait_group_events(1,&e);
796 
797  //parallel load and clamp
798  /*
799  int count = 0;
800  for (int i=0; i < loadSize; i=i+wsize)
801  {
802  int currentX = x + wsize*(count++);
803 
804  int localId = get_local_id(0);
805 
806  if ((localId+i) > loadSize)
807  break;
808 
809  temp[localId+i] = im[y*columns+ClampToCanvas(currentX-radius, columns)];
810 
811  if (y==0 && get_group_id(0) == 0)
812  {
813  printf("(%d %d) temp %d load %d currentX %d\n", x, y, localId+i, ClampToCanvas(currentX-radius, columns), currentX);
814  }
815  }
816  */
817 
818  //group coordinate
819  const int groupX=get_local_size(0)*get_group_id(0);
820  const int groupY=get_local_size(1)*get_group_id(1);
821 
822  //parallel load and clamp
823  for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
824  {
825  //int cx = ClampToCanvas(groupX+i, columns);
826  temp[i] = im[y * columns + ClampToCanvas(i+groupX-radius, columns)];
827 
828  /*if (0 && y==0 && get_group_id(1) == 0)
829  {
830  printf("(%d %d) temp %d load %d groupX %d\n", x, y, i, ClampToCanvas(groupX+i, columns), groupX);
831  }*/
832  }
833 
834  // barrier
835  barrier(CLK_LOCAL_MEM_FENCE);
836 
837  // only do the work if this is not a patched item
838  if (get_global_id(0) < columns)
839  {
840  // compute
841  float4 result = (float4) 0;
842 
843  int i = 0;
844 
845  \n #ifndef UFACTOR \n
846  \n #define UFACTOR 8 \n
847  \n #endif \n
848 
849  for ( ; i+UFACTOR < width; )
850  {
851  \n #pragma unroll UFACTOR\n
852  for (int j=0; j < UFACTOR; j++, i++)
853  {
854  result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
855  }
856  }
857 
858  for ( ; i < width; i++)
859  {
860  result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
861  }
862 
863  result.x = ClampToQuantum(result.x);
864  result.y = ClampToQuantum(result.y);
865  result.z = ClampToQuantum(result.z);
866  result.w = ClampToQuantum(result.w);
867 
868  // write back to global
869  filtered_im[y*columns+x] = result;
870  }
871  }
872  )
873 
874  STRINGIFY(
875  /*
876  Reduce image noise and reduce detail levels by line
877  im: input pixels filtered_in filtered_im: output pixels
878  filter : convolve kernel width: convolve kernel size
879  channel : define which channel is blured\
880  is_RGBA_BGRA : define the input is RGBA or BGRA
881  */
882  __kernel void BlurColumn(const __global float4 *blurRowData, __global CLPixelType *filtered_im,
883  const ChannelType channel, __constant float *filter,
884  const unsigned int width,
885  const unsigned int imageColumns, const unsigned int imageRows,
886  __local float4 *temp)
887  {
888  const int x = get_global_id(0);
889  const int y = get_global_id(1);
890 
891  //const int columns = get_global_size(0);
892  //const int rows = get_global_size(1);
893  const int columns = imageColumns;
894  const int rows = imageRows;
895 
896  unsigned int radius = (width-1)/2;
897  const int wsize = get_local_size(1);
898  const unsigned int loadSize = wsize+width;
899 
900  //group coordinate
901  const int groupX=get_local_size(0)*get_group_id(0);
902  const int groupY=get_local_size(1)*get_group_id(1);
903  //notice that get_local_size(0) is 1, so
904  //groupX=get_group_id(0);
905 
906  //parallel load and clamp
907  for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
908  {
909  temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
910  }
911 
912  // barrier
913  barrier(CLK_LOCAL_MEM_FENCE);
914 
915  // only do the work if this is not a patched item
916  if (get_global_id(1) < rows)
917  {
918  // compute
919  float4 result = (float4) 0;
920 
921  int i = 0;
922 
923  \n #ifndef UFACTOR \n
924  \n #define UFACTOR 8 \n
925  \n #endif \n
926 
927  for ( ; i+UFACTOR < width; )
928  {
929  \n #pragma unroll UFACTOR \n
930  for (int j=0; j < UFACTOR; j++, i++)
931  {
932  result+=filter[i]*temp[i+get_local_id(1)];
933  }
934  }
935 
936  for ( ; i < width; i++)
937  {
938  result+=filter[i]*temp[i+get_local_id(1)];
939  }
940 
941  result.x = ClampToQuantum(result.x);
942  result.y = ClampToQuantum(result.y);
943  result.z = ClampToQuantum(result.z);
944  result.w = ClampToQuantum(result.w);
945 
946  // write back to global
947  filtered_im[y*columns+x] = (CLPixelType) (result.x,result.y,result.z,result.w);
948  }
949 
950  }
951  )
952 
953 /*
954 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
955 % %
956 % %
957 % %
958 % C o m p o s i t e %
959 % %
960 % %
961 % %
962 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
963 */
964 
965  STRINGIFY(
966  inline float ColorDodge(const float Sca,
967  const float Sa,const float Dca,const float Da)
968  {
969  /*
970  Oct 2004 SVG specification.
971  */
972  if ((Sca*Da+Dca*Sa) >= Sa*Da)
973  return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
974  return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
975 
976 
977  /*
978  New specification, March 2009 SVG specification. This specification was
979  also wrong of non-overlap cases.
980  */
981  /*
982  if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
983  return(Sca*(1.0-Da));
984  if (fabs(Sca-Sa) < MagickEpsilon)
985  return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
986  return(Sa*MagickMin(Da,Dca*Sa/(Sa-Sca)));
987  */
988 
989  /*
990  Working from first principles using the original formula:
991 
992  f(Sc,Dc) = Dc/(1-Sc)
993 
994  This works correctly! Looks like the 2004 model was right but just
995  required a extra condition for correct handling.
996  */
997 
998  /*
999  if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
1000  return(Sca*(1.0-Da)+Dca*(1.0-Sa));
1001  if (fabs(Sca-Sa) < MagickEpsilon)
1002  return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
1003  return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
1004  */
1005  }
1006 
1007  inline void CompositeColorDodge(const float4 *p,
1008  const float4 *q,float4 *composite) {
1009 
1010  float
1011  Da,
1012  gamma,
1013  Sa;
1014 
1015  Sa=1.0f-QuantumScale*getOpacityF4(*p); /* simplify and speed up equations */
1016  Da=1.0f-QuantumScale*getOpacityF4(*q);
1017  gamma=RoundToUnity(Sa+Da-Sa*Da); /* over blend, as per SVG doc */
1018  setOpacityF4(composite, QuantumRange*(1.0-gamma));
1019  gamma=QuantumRange/(fabs(gamma) < MagickEpsilon ? MagickEpsilon : gamma);
1020  setRedF4(composite,gamma*ColorDodge(QuantumScale*getRedF4(*p)*Sa,Sa,QuantumScale*
1021  getRedF4(*q)*Da,Da));
1022  setGreenF4(composite,gamma*ColorDodge(QuantumScale*getGreenF4(*p)*Sa,Sa,QuantumScale*
1023  getGreenF4(*q)*Da,Da));
1024  setBlueF4(composite,gamma*ColorDodge(QuantumScale*getBlueF4(*p)*Sa,Sa,QuantumScale*
1025  getBlueF4(*q)*Da,Da));
1026  }
1027  )
1028 
1029  STRINGIFY(
1030  inline void MagickPixelCompositePlus(const float4 *p,
1031  const float alpha,const float4 *q,
1032  const float beta,float4 *composite)
1033  {
1034  float
1035  gamma;
1036 
1037  float
1038  Da,
1039  Sa;
1040  /*
1041  Add two pixels with the given opacities.
1042  */
1043  Sa=1.0-QuantumScale*alpha;
1044  Da=1.0-QuantumScale*beta;
1045  gamma=RoundToUnity(Sa+Da); /* 'Plus' blending -- not 'Over' blending */
1046  setOpacityF4(composite,(float) QuantumRange*(1.0-gamma));
1047  gamma=PerceptibleReciprocal(gamma);
1048  setRedF4(composite,gamma*(Sa*getRedF4(*p)+Da*getRedF4(*q)));
1049  setGreenF4(composite,gamma*(Sa*getGreenF4(*p)+Da*getGreenF4(*q)));
1050  setBlueF4(composite,gamma*(Sa*getBlueF4(*p)+Da*getBlueF4(*q)));
1051  }
1052  )
1053 
1054  STRINGIFY(
1055  inline void MagickPixelCompositeBlend(const float4 *p,
1056  const float alpha,const float4 *q,
1057  const float beta,float4 *composite)
1058  {
1059  MagickPixelCompositePlus(p,(float) (QuantumRange-alpha*
1060  (QuantumRange-getOpacityF4(*p))),q,(float) (QuantumRange-beta*
1061  (QuantumRange-getOpacityF4(*q))),composite);
1062  }
1063  )
1064 
1065  STRINGIFY(
1066  __kernel
1067  void Composite(__global CLPixelType *image,
1068  const unsigned int imageWidth,
1069  const unsigned int imageHeight,
1070  const unsigned int imageMatte,
1071  const __global CLPixelType *compositeImage,
1072  const unsigned int compositeWidth,
1073  const unsigned int compositeHeight,
1074  const unsigned int compositeMatte,
1075  const unsigned int compose,
1076  const ChannelType channel,
1077  const float destination_dissolve,
1078  const float source_dissolve) {
1079 
1080  uint2 index;
1081  index.x = get_global_id(0);
1082  index.y = get_global_id(1);
1083 
1084 
1085  if (index.x >= imageWidth
1086  || index.y >= imageHeight) {
1087  return;
1088  }
1089  const CLPixelType inputPixel = image[index.y*imageWidth+index.x];
1090  float4 destination;
1091  setRedF4(&destination,getRed(inputPixel));
1092  setGreenF4(&destination,getGreen(inputPixel));
1093  setBlueF4(&destination,getBlue(inputPixel));
1094 
1095 
1096  const CLPixelType compositePixel
1097  = compositeImage[index.y*imageWidth+index.x];
1098  float4 source;
1099  setRedF4(&source,getRed(compositePixel));
1100  setGreenF4(&source,getGreen(compositePixel));
1101  setBlueF4(&source,getBlue(compositePixel));
1102 
1103  if (imageMatte != 0) {
1104  setOpacityF4(&destination,getOpacity(inputPixel));
1105  }
1106  else {
1107  setOpacityF4(&destination,0.0f);
1108  }
1109 
1110  if (compositeMatte != 0) {
1111  setOpacityF4(&source,getOpacity(compositePixel));
1112  }
1113  else {
1114  setOpacityF4(&source,0.0f);
1115  }
1116 
1117  float4 composite=destination;
1118 
1119  CompositeOperator op = (CompositeOperator)compose;
1120  switch (op) {
1121  case ColorDodgeCompositeOp:
1122  CompositeColorDodge(&source,&destination,&composite);
1123  break;
1124  case BlendCompositeOp:
1125  MagickPixelCompositeBlend(&source,source_dissolve,&destination,
1126  destination_dissolve,&composite);
1127  break;
1128  default:
1129  // unsupported operators
1130  break;
1131  };
1132 
1133  CLPixelType outputPixel;
1134  setRed(&outputPixel, ClampToQuantum(getRedF4(composite)));
1135  setGreen(&outputPixel, ClampToQuantum(getGreenF4(composite)));
1136  setBlue(&outputPixel, ClampToQuantum(getBlueF4(composite)));
1137  setOpacity(&outputPixel, ClampToQuantum(getOpacityF4(composite)));
1138  image[index.y*imageWidth+index.x] = outputPixel;
1139  }
1140  )
1141 
1142 /*
1143 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1144 % %
1145 % %
1146 % %
1147 % C o n t r a s t %
1148 % %
1149 % %
1150 % %
1151 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1152 */
1153 
1154  STRINGIFY(
1155 
1156  inline float3 ConvertRGBToHSB(CLPixelType pixel) {
1157  float3 HueSaturationBrightness;
1158  HueSaturationBrightness.x = 0.0f; // Hue
1159  HueSaturationBrightness.y = 0.0f; // Saturation
1160  HueSaturationBrightness.z = 0.0f; // Brightness
1161 
1162  float r=(float) getRed(pixel);
1163  float g=(float) getGreen(pixel);
1164  float b=(float) getBlue(pixel);
1165 
1166  float tmin=MagickMin(MagickMin(r,g),b);
1167  float tmax= MagickMax(MagickMax(r,g),b);
1168 
1169  if (tmax!=0.0f) {
1170  float delta=tmax-tmin;
1171  HueSaturationBrightness.y=delta/tmax;
1172  HueSaturationBrightness.z=QuantumScale*tmax;
1173 
1174  if (delta != 0.0f) {
1175  HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f));
1176  HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta;
1177  HueSaturationBrightness.x/=6.0f;
1178  HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f;
1179  }
1180  }
1181  return HueSaturationBrightness;
1182  }
1183 
1184  inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) {
1185 
1186  float hue = HueSaturationBrightness.x;
1187  float brightness = HueSaturationBrightness.z;
1188  float saturation = HueSaturationBrightness.y;
1189 
1190  CLPixelType rgb;
1191 
1192  if (saturation == 0.0f) {
1193  setRed(&rgb,ClampToQuantum(QuantumRange*brightness));
1194  setGreen(&rgb,getRed(rgb));
1195  setBlue(&rgb,getRed(rgb));
1196  }
1197  else {
1198 
1199  float h=6.0f*(hue-floor(hue));
1200  float f=h-floor(h);
1201  float p=brightness*(1.0f-saturation);
1202  float q=brightness*(1.0f-saturation*f);
1203  float t=brightness*(1.0f-(saturation*(1.0f-f)));
1204 
1205  float clampedBrightness = ClampToQuantum(QuantumRange*brightness);
1206  float clamped_t = ClampToQuantum(QuantumRange*t);
1207  float clamped_p = ClampToQuantum(QuantumRange*p);
1208  float clamped_q = ClampToQuantum(QuantumRange*q);
1209  int ih = (int)h;
1210  setRed(&rgb, (ih == 1)?clamped_q:
1211  (ih == 2 || ih == 3)?clamped_p:
1212  (ih == 4)?clamped_t:
1213  clampedBrightness);
1214 
1215  setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness:
1216  (ih == 3)?clamped_q:
1217  (ih == 4 || ih == 5)?clamped_p:
1218  clamped_t);
1219 
1220  setBlue(&rgb, (ih == 2)?clamped_t:
1221  (ih == 3 || ih == 4)?clampedBrightness:
1222  (ih == 5)?clamped_q:
1223  clamped_p);
1224  }
1225  return rgb;
1226  }
1227 
1228  __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen)
1229  {
1230 
1231  const int sign = sharpen!=0?1:-1;
1232  const int x = get_global_id(0);
1233  const int y = get_global_id(1);
1234  const int columns = get_global_size(0);
1235  const int c = x + y * columns;
1236 
1237  CLPixelType pixel = im[c];
1238  float3 HueSaturationBrightness = ConvertRGBToHSB(pixel);
1239  float brightness = HueSaturationBrightness.z;
1240  brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
1241  brightness = clamp(brightness,0.0f,1.0f);
1242  HueSaturationBrightness.z = brightness;
1243 
1244  CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness);
1245  filteredPixel.w = pixel.w;
1246  im[c] = filteredPixel;
1247  }
1248  )
1249 
1250 /*
1251 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1252 % %
1253 % %
1254 % %
1255 % C o n t r a s t S t r e t c h %
1256 % %
1257 % %
1258 % %
1259 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1260 */
1261 
1262  STRINGIFY(
1263  /*
1264  */
1265  __kernel void Histogram(__global CLPixelType * restrict im,
1266  const ChannelType channel,
1267  const int method,
1268  const int colorspace,
1269  __global uint4 * restrict histogram)
1270  {
1271  const int x = get_global_id(0);
1272  const int y = get_global_id(1);
1273  const int columns = get_global_size(0);
1274  const int c = x + y * columns;
1275  if ((channel & SyncChannels) != 0)
1276  {
1277  float intensity = GetPixelIntensity(method, colorspace,im[c]);
1278  uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
1279  atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
1280  }
1281  else
1282  {
1283  // for equalizing, we always need all channels?
1284  // otherwise something more
1285  }
1286  }
1287  )
1288 
1289  STRINGIFY(
1290  /*
1291  */
1292  __kernel void ContrastStretch(__global CLPixelType * restrict im,
1293  const ChannelType channel,
1294  __global CLPixelType * restrict stretch_map,
1295  const float4 white, const float4 black)
1296  {
1297  const int x = get_global_id(0);
1298  const int y = get_global_id(1);
1299  const int columns = get_global_size(0);
1300  const int c = x + y * columns;
1301 
1302  uint ePos;
1303  CLPixelType oValue, eValue;
1304  CLQuantum red, green, blue, opacity;
1305 
1306  //read from global
1307  oValue=im[c];
1308 
1309  if ((channel & RedChannel) != 0)
1310  {
1311  if (getRedF4(white) != getRedF4(black))
1312  {
1313  ePos = ScaleQuantumToMap(getRed(oValue));
1314  eValue = stretch_map[ePos];
1315  red = getRed(eValue);
1316  }
1317  }
1318 
1319  if ((channel & GreenChannel) != 0)
1320  {
1321  if (getGreenF4(white) != getGreenF4(black))
1322  {
1323  ePos = ScaleQuantumToMap(getGreen(oValue));
1324  eValue = stretch_map[ePos];
1325  green = getGreen(eValue);
1326  }
1327  }
1328 
1329  if ((channel & BlueChannel) != 0)
1330  {
1331  if (getBlueF4(white) != getBlueF4(black))
1332  {
1333  ePos = ScaleQuantumToMap(getBlue(oValue));
1334  eValue = stretch_map[ePos];
1335  blue = getBlue(eValue);
1336  }
1337  }
1338 
1339  if ((channel & OpacityChannel) != 0)
1340  {
1341  if (getOpacityF4(white) != getOpacityF4(black))
1342  {
1343  ePos = ScaleQuantumToMap(getOpacity(oValue));
1344  eValue = stretch_map[ePos];
1345  opacity = getOpacity(eValue);
1346  }
1347  }
1348 
1349  //write back
1350  im[c]=(CLPixelType)(blue, green, red, opacity);
1351 
1352  }
1353  )
1354 
1355 /*
1356 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1357 % %
1358 % %
1359 % %
1360 % C o n v o l v e %
1361 % %
1362 % %
1363 % %
1364 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1365 */
1366 
1367  STRINGIFY(
1368  __kernel
1369  void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
1370  const unsigned int imageWidth, const unsigned int imageHeight,
1371  __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1372  const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
1373 
1374  int2 blockID;
1375  blockID.x = get_group_id(0);
1376  blockID.y = get_group_id(1);
1377 
1378  // image area processed by this workgroup
1379  int2 imageAreaOrg;
1380  imageAreaOrg.x = blockID.x * get_local_size(0);
1381  imageAreaOrg.y = blockID.y * get_local_size(1);
1382 
1383  int2 midFilterDimen;
1384  midFilterDimen.x = (filterWidth-1)/2;
1385  midFilterDimen.y = (filterHeight-1)/2;
1386 
1387  int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
1388 
1389  // dimension of the local cache
1390  int2 cachedAreaDimen;
1391  cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
1392  cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
1393 
1394  // cache the pixels accessed by this workgroup in local memory
1395  int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
1396  int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
1397  int groupSize = get_local_size(0) * get_local_size(1);
1398  for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
1399 
1400  int2 cachedAreaIndex;
1401  cachedAreaIndex.x = i % cachedAreaDimen.x;
1402  cachedAreaIndex.y = i / cachedAreaDimen.x;
1403 
1404  int2 imagePixelIndex;
1405  imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
1406 
1407  // only support EdgeVirtualPixelMethod through ClampToCanvas
1408  // TODO: implement other virtual pixel method
1409  imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
1410  imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
1411 
1412  pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
1413  }
1414 
1415  // cache the filter
1416  for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
1417  filterCache[i] = filter[i];
1418  }
1419  barrier(CLK_LOCAL_MEM_FENCE);
1420 
1421 
1422  int2 imageIndex;
1423  imageIndex.x = imageAreaOrg.x + get_local_id(0);
1424  imageIndex.y = imageAreaOrg.y + get_local_id(1);
1425 
1426  // if out-of-range, stops here and quit
1427  if (imageIndex.x >= imageWidth
1428  || imageIndex.y >= imageHeight) {
1429  return;
1430  }
1431 
1432  int filterIndex = 0;
1433  float4 sum = (float4)0.0f;
1434  float gamma = 0.0f;
1435  if (((channel & OpacityChannel) == 0) || (matte == 0)) {
1436  int cacheIndexY = get_local_id(1);
1437  for (int j = 0; j < filterHeight; j++) {
1438  int cacheIndexX = get_local_id(0);
1439  for (int i = 0; i < filterWidth; i++) {
1440  CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1441  float f = filterCache[filterIndex];
1442 
1443  sum.x += f * p.x;
1444  sum.y += f * p.y;
1445  sum.z += f * p.z;
1446  sum.w += f * p.w;
1447 
1448  gamma += f;
1449  filterIndex++;
1450  cacheIndexX++;
1451  }
1452  cacheIndexY++;
1453  }
1454  }
1455  else {
1456  int cacheIndexY = get_local_id(1);
1457  for (int j = 0; j < filterHeight; j++) {
1458  int cacheIndexX = get_local_id(0);
1459  for (int i = 0; i < filterWidth; i++) {
1460 
1461  CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1462  float alpha = QuantumScale*(QuantumRange-p.w);
1463  float f = filterCache[filterIndex];
1464  float g = alpha * f;
1465 
1466  sum.x += g*p.x;
1467  sum.y += g*p.y;
1468  sum.z += g*p.z;
1469  sum.w += f*p.w;
1470 
1471  gamma += g;
1472  filterIndex++;
1473  cacheIndexX++;
1474  }
1475  cacheIndexY++;
1476  }
1477  gamma = PerceptibleReciprocal(gamma);
1478  sum.xyz = gamma*sum.xyz;
1479  }
1480  CLPixelType outputPixel;
1481  outputPixel.x = ClampToQuantum(sum.x);
1482  outputPixel.y = ClampToQuantum(sum.y);
1483  outputPixel.z = ClampToQuantum(sum.z);
1484  outputPixel.w = ((channel & OpacityChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1485 
1486  output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1487  }
1488  )
1489 
1490  STRINGIFY(
1491  __kernel
1492  void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
1493  const uint imageWidth, const uint imageHeight,
1494  __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1495  const uint matte, const ChannelType channel) {
1496 
1497  int2 imageIndex;
1498  imageIndex.x = get_global_id(0);
1499  imageIndex.y = get_global_id(1);
1500 
1501  /*
1502  unsigned int imageWidth = get_global_size(0);
1503  unsigned int imageHeight = get_global_size(1);
1504  */
1505  if (imageIndex.x >= imageWidth
1506  || imageIndex.y >= imageHeight)
1507  return;
1508 
1509  int2 midFilterDimen;
1510  midFilterDimen.x = (filterWidth-1)/2;
1511  midFilterDimen.y = (filterHeight-1)/2;
1512 
1513  int filterIndex = 0;
1514  float4 sum = (float4)0.0f;
1515  float gamma = 0.0f;
1516  if (((channel & OpacityChannel) == 0) || (matte == 0)) {
1517  for (int j = 0; j < filterHeight; j++) {
1518  int2 inputPixelIndex;
1519  inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1520  inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1521  for (int i = 0; i < filterWidth; i++) {
1522  inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1523  inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1524 
1525  CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1526  float f = filter[filterIndex];
1527 
1528  sum.x += f * p.x;
1529  sum.y += f * p.y;
1530  sum.z += f * p.z;
1531  sum.w += f * p.w;
1532 
1533  gamma += f;
1534 
1535  filterIndex++;
1536  }
1537  }
1538  }
1539  else {
1540 
1541  for (int j = 0; j < filterHeight; j++) {
1542  int2 inputPixelIndex;
1543  inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1544  inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1545  for (int i = 0; i < filterWidth; i++) {
1546  inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1547  inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1548 
1549  CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1550  float alpha = QuantumScale*(QuantumRange-p.w);
1551  float f = filter[filterIndex];
1552  float g = alpha * f;
1553 
1554  sum.x += g*p.x;
1555  sum.y += g*p.y;
1556  sum.z += g*p.z;
1557  sum.w += f*p.w;
1558 
1559  gamma += g;
1560 
1561 
1562  filterIndex++;
1563  }
1564  }
1565  gamma = PerceptibleReciprocal(gamma);
1566  sum.xyz = gamma*sum.xyz;
1567  }
1568 
1569  CLPixelType outputPixel;
1570  outputPixel.x = ClampToQuantum(sum.x);
1571  outputPixel.y = ClampToQuantum(sum.y);
1572  outputPixel.z = ClampToQuantum(sum.z);
1573  outputPixel.w = ((channel & OpacityChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1574 
1575  output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1576  }
1577  )
1578 
1579 /*
1580 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1581 % %
1582 % %
1583 % %
1584 % D e s p e c k l e %
1585 % %
1586 % %
1587 % %
1588 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1589 */
1590 
1591  STRINGIFY(
1592 
1593  __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1594  , const unsigned int imageWidth, const unsigned int imageHeight
1595  , const int2 offset, const int polarity, const int matte) {
1596 
1597  int x = get_global_id(0);
1598  int y = get_global_id(1);
1599 
1600  CLPixelType v = inputImage[y*imageWidth+x];
1601 
1602  int2 neighbor;
1603  neighbor.y = y + offset.y;
1604  neighbor.x = x + offset.x;
1605 
1606  int2 clampedNeighbor;
1607  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1608  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1609 
1610  CLPixelType r = (clampedNeighbor.x == neighbor.x
1611  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1612  :(CLPixelType)0;
1613 
1614  int sv[4];
1615  sv[0] = (int)v.x;
1616  sv[1] = (int)v.y;
1617  sv[2] = (int)v.z;
1618  sv[3] = (int)v.w;
1619 
1620  int sr[4];
1621  sr[0] = (int)r.x;
1622  sr[1] = (int)r.y;
1623  sr[2] = (int)r.z;
1624  sr[3] = (int)r.w;
1625 
1626  if (polarity > 0) {
1627  \n #pragma unroll 4\n
1628  for (unsigned int i = 0; i < 4; i++) {
1629  sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
1630  }
1631  }
1632  else {
1633  \n #pragma unroll 4\n
1634  for (unsigned int i = 0; i < 4; i++) {
1635  sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
1636  }
1637 
1638  }
1639 
1640  v.x = (CLQuantum)sv[0];
1641  v.y = (CLQuantum)sv[1];
1642  v.z = (CLQuantum)sv[2];
1643 
1644  if (matte!=0)
1645  v.w = (CLQuantum)sv[3];
1646 
1647  outputImage[y*imageWidth+x] = v;
1648 
1649  }
1650 
1651 
1652  )
1653 
1654 
1655 
1656  STRINGIFY(
1657 
1658  __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1659  , const unsigned int imageWidth, const unsigned int imageHeight
1660  , const int2 offset, const int polarity, const int matte) {
1661 
1662  int x = get_global_id(0);
1663  int y = get_global_id(1);
1664 
1665  CLPixelType v = inputImage[y*imageWidth+x];
1666 
1667  int2 neighbor, clampedNeighbor;
1668 
1669  neighbor.y = y + offset.y;
1670  neighbor.x = x + offset.x;
1671  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1672  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1673 
1674  CLPixelType r = (clampedNeighbor.x == neighbor.x
1675  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1676  :(CLPixelType)0;
1677 
1678 
1679  neighbor.y = y - offset.y;
1680  neighbor.x = x - offset.x;
1681  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1682  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1683 
1684  CLPixelType s = (clampedNeighbor.x == neighbor.x
1685  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1686  :(CLPixelType)0;
1687 
1688 
1689  int sv[4];
1690  sv[0] = (int)v.x;
1691  sv[1] = (int)v.y;
1692  sv[2] = (int)v.z;
1693  sv[3] = (int)v.w;
1694 
1695  int sr[4];
1696  sr[0] = (int)r.x;
1697  sr[1] = (int)r.y;
1698  sr[2] = (int)r.z;
1699  sr[3] = (int)r.w;
1700 
1701  int ss[4];
1702  ss[0] = (int)s.x;
1703  ss[1] = (int)s.y;
1704  ss[2] = (int)s.z;
1705  ss[3] = (int)s.w;
1706 
1707  if (polarity > 0) {
1708  \n #pragma unroll 4\n
1709  for (unsigned int i = 0; i < 4; i++) {
1710  //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] ) ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
1711  //
1712  //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1713  //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1714  sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1715  }
1716  }
1717  else {
1718  \n #pragma unroll 4\n
1719  for (unsigned int i = 0; i < 4; i++) {
1720  //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] ) ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
1721  //
1722  //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1723  sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1724  }
1725  }
1726 
1727  v.x = (CLQuantum)sv[0];
1728  v.y = (CLQuantum)sv[1];
1729  v.z = (CLQuantum)sv[2];
1730 
1731  if (matte!=0)
1732  v.w = (CLQuantum)sv[3];
1733 
1734  outputImage[y*imageWidth+x] = v;
1735 
1736  }
1737 
1738 
1739  )
1740 
1741 /*
1742 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1743 % %
1744 % %
1745 % %
1746 % E q u a l i z e %
1747 % %
1748 % %
1749 % %
1750 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1751 */
1752 
1753  STRINGIFY(
1754  /*
1755  */
1756  __kernel void Equalize(__global CLPixelType * restrict im,
1757  const ChannelType channel,
1758  __global CLPixelType * restrict equalize_map,
1759  const float4 white, const float4 black)
1760  {
1761  const int x = get_global_id(0);
1762  const int y = get_global_id(1);
1763  const int columns = get_global_size(0);
1764  const int c = x + y * columns;
1765 
1766  uint ePos;
1767  CLPixelType oValue, eValue;
1768  CLQuantum red, green, blue, opacity;
1769 
1770  //read from global
1771  oValue=im[c];
1772 
1773  if ((channel & SyncChannels) != 0)
1774  {
1775  if (getRedF4(white) != getRedF4(black))
1776  {
1777  ePos = ScaleQuantumToMap(getRed(oValue));
1778  eValue = equalize_map[ePos];
1779  red = getRed(eValue);
1780  ePos = ScaleQuantumToMap(getGreen(oValue));
1781  eValue = equalize_map[ePos];
1782  green = getRed(eValue);
1783  ePos = ScaleQuantumToMap(getBlue(oValue));
1784  eValue = equalize_map[ePos];
1785  blue = getRed(eValue);
1786  ePos = ScaleQuantumToMap(getOpacity(oValue));
1787  eValue = equalize_map[ePos];
1788  opacity = getRed(eValue);
1789 
1790  //write back
1791  im[c]=(CLPixelType)(blue, green, red, opacity);
1792  }
1793 
1794  }
1795 
1796  // for equalizing, we always need all channels?
1797  // otherwise something more
1798 
1799  }
1800  )
1801 
1802 /*
1803 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1804 % %
1805 % %
1806 % %
1807 % F u n c t i o n %
1808 % %
1809 % %
1810 % %
1811 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1812 */
1813 
1814  STRINGIFY(
1815 
1816  /*
1817  apply FunctionImageChannel(braightness-contrast)
1818  */
1819  CLPixelType ApplyFunction(CLPixelType pixel,const MagickFunction function,
1820  const unsigned int number_parameters,
1821  __constant float *parameters)
1822  {
1823  float4 result = (float4) 0.0f;
1824  switch (function)
1825  {
1826  case PolynomialFunction:
1827  {
1828  for (unsigned int i=0; i < number_parameters; i++)
1829  result = result*(float4)QuantumScale*convert_float4(pixel) + parameters[i];
1830  result *= (float4)QuantumRange;
1831  break;
1832  }
1833  case SinusoidFunction:
1834  {
1835  float freq,phase,ampl,bias;
1836  freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1837  phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
1838  ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
1839  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1840  result.x = QuantumRange*(ampl*sin(2.0f*MagickPI*
1841  (freq*QuantumScale*(float)pixel.x + phase/360.0f)) + bias);
1842  result.y = QuantumRange*(ampl*sin(2.0f*MagickPI*
1843  (freq*QuantumScale*(float)pixel.y + phase/360.0f)) + bias);
1844  result.z = QuantumRange*(ampl*sin(2.0f*MagickPI*
1845  (freq*QuantumScale*(float)pixel.z + phase/360.0f)) + bias);
1846  result.w = QuantumRange*(ampl*sin(2.0f*MagickPI*
1847  (freq*QuantumScale*(float)pixel.w + phase/360.0f)) + bias);
1848  break;
1849  }
1850  case ArcsinFunction:
1851  {
1852  float width,range,center,bias;
1853  width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1854  center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1855  range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1856  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1857 
1858  result.x = 2.0f/width*(QuantumScale*(float)pixel.x - center);
1859  result.x = range/MagickPI*asin(result.x)+bias;
1860  result.x = ( result.x <= -1.0f ) ? bias - range/2.0f : result.x;
1861  result.x = ( result.x >= 1.0f ) ? bias + range/2.0f : result.x;
1862 
1863  result.y = 2.0f/width*(QuantumScale*(float)pixel.y - center);
1864  result.y = range/MagickPI*asin(result.y)+bias;
1865  result.y = ( result.y <= -1.0f ) ? bias - range/2.0f : result.y;
1866  result.y = ( result.y >= 1.0f ) ? bias + range/2.0f : result.y;
1867 
1868  result.z = 2.0f/width*(QuantumScale*(float)pixel.z - center);
1869  result.z = range/MagickPI*asin(result.z)+bias;
1870  result.z = ( result.z <= -1.0f ) ? bias - range/2.0f : result.x;
1871  result.z = ( result.z >= 1.0f ) ? bias + range/2.0f : result.x;
1872 
1873 
1874  result.w = 2.0f/width*(QuantumScale*(float)pixel.w - center);
1875  result.w = range/MagickPI*asin(result.w)+bias;
1876  result.w = ( result.w <= -1.0f ) ? bias - range/2.0f : result.w;
1877  result.w = ( result.w >= 1.0f ) ? bias + range/2.0f : result.w;
1878 
1879  result *= (float4)QuantumRange;
1880  break;
1881  }
1882  case ArctanFunction:
1883  {
1884  float slope,range,center,bias;
1885  slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1886  center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1887  range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1888  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1889  result = (float4)MagickPI*(float4)slope*((float4)QuantumScale*convert_float4(pixel)-(float4)center);
1890  result = (float4)QuantumRange*((float4)range/(float4)MagickPI*atan(result) + (float4)bias);
1891  break;
1892  }
1893  case UndefinedFunction:
1894  break;
1895  }
1896  return (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
1897  ClampToQuantum(result.z), ClampToQuantum(result.w));
1898  }
1899  )
1900 
1901  STRINGIFY(
1902  /*
1903  Improve brightness / contrast of the image
1904  channel : define which channel is improved
1905  function : the function called to enchance the brightness contrast
1906  number_parameters : numbers of parameters
1907  parameters : the parameter
1908  */
1909  __kernel void ComputeFunction(__global CLPixelType *im,
1910  const ChannelType channel, const MagickFunction function,
1911  const unsigned int number_parameters, __constant float *parameters)
1912  {
1913  const int x = get_global_id(0);
1914  const int y = get_global_id(1);
1915  const int columns = get_global_size(0);
1916  const int c = x + y * columns;
1917  im[c] = ApplyFunction(im[c], function, number_parameters, parameters);
1918  }
1919  )
1920 
1921 /*
1922 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1923 % %
1924 % %
1925 % %
1926 % G r a y s c a l e %
1927 % %
1928 % %
1929 % %
1930 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1931 */
1932 
1933  STRINGIFY(
1934  __kernel void Grayscale(__global CLPixelType *im,
1935  const int method, const int colorspace)
1936  {
1937 
1938  const int x = get_global_id(0);
1939  const int y = get_global_id(1);
1940  const int columns = get_global_size(0);
1941  const int c = x + y * columns;
1942 
1943  CLPixelType pixel = im[c];
1944 
1945  float
1946  blue,
1947  green,
1948  intensity,
1949  red;
1950 
1951  red=(float)getRed(pixel);
1952  green=(float)getGreen(pixel);
1953  blue=(float)getBlue(pixel);
1954 
1955  intensity=0.0;
1956 
1957  CLPixelType filteredPixel;
1958 
1959  switch (method)
1960  {
1962  {
1963  intensity=(red+green+blue)/3.0;
1964  break;
1965  }
1967  {
1968  intensity=MagickMax(MagickMax(red,green),blue);
1969  break;
1970  }
1972  {
1973  intensity=(MagickMin(MagickMin(red,green),blue)+
1974  MagickMax(MagickMax(red,green),blue))/2.0;
1975  break;
1976  }
1978  {
1979  intensity=(float) (((float) red*red+green*green+
1980  blue*blue)/(3.0*QuantumRange));
1981  break;
1982  }
1984  {
1985  /*
1986  if (colorspace == RGBColorspace)
1987  {
1988  red=EncodePixelGamma(red);
1989  green=EncodePixelGamma(green);
1990  blue=EncodePixelGamma(blue);
1991  }
1992  */
1993  intensity=0.298839*red+0.586811*green+0.114350*blue;
1994  break;
1995  }
1997  {
1998  /*
1999  if (image->colorspace == sRGBColorspace)
2000  {
2001  red=DecodePixelGamma(red);
2002  green=DecodePixelGamma(green);
2003  blue=DecodePixelGamma(blue);
2004  }
2005  */
2006  intensity=0.298839*red+0.586811*green+0.114350*blue;
2007  break;
2008  }
2010  default:
2011  {
2012  /*
2013  if (image->colorspace == RGBColorspace)
2014  {
2015  red=EncodePixelGamma(red);
2016  green=EncodePixelGamma(green);
2017  blue=EncodePixelGamma(blue);
2018  }
2019  */
2020  intensity=0.212656*red+0.715158*green+0.072186*blue;
2021  break;
2022  }
2024  {
2025  /*
2026  if (image->colorspace == sRGBColorspace)
2027  {
2028  red=DecodePixelGamma(red);
2029  green=DecodePixelGamma(green);
2030  blue=DecodePixelGamma(blue);
2031  }
2032  */
2033  intensity=0.212656*red+0.715158*green+0.072186*blue;
2034  break;
2035  }
2037  {
2038  intensity=(float) (sqrt((float) red*red+green*green+
2039  blue*blue)/sqrt(3.0));
2040  break;
2041  }
2042 
2043  }
2044 
2045  setGray(&filteredPixel, ClampToQuantum(intensity));
2046 
2047  filteredPixel.w = pixel.w;
2048 
2049  im[c] = filteredPixel;
2050  }
2051  )
2052 
2053 /*
2054 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2055 % %
2056 % %
2057 % %
2058 % L o c a l C o n t r a s t %
2059 % %
2060 % %
2061 % %
2062 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2063 */
2064 
2065  STRINGIFY(
2066  inline int mirrorBottom(int value)
2067  {
2068  return (value < 0) ? - (value) : value;
2069  }
2070  inline int mirrorTop(int value, int width)
2071  {
2072  return (value >= width) ? (2 * width - value - 1) : value;
2073  }
2074 
2075  __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
2076  const int radius,
2077  const int imageWidth,
2078  const int imageHeight)
2079  {
2080  const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
2081 
2082  int x = get_local_id(0);
2083  int y = get_global_id(1);
2084 
2085  if ((x >= imageWidth) || (y >= imageHeight))
2086  return;
2087 
2088  global CLPixelType *src = srcImage + y * imageWidth;
2089 
2090  for (int i = x; i < imageWidth; i += get_local_size(0)) {
2091  float sum = 0.0f;
2092  float weight = 1.0f;
2093 
2094  int j = i - radius;
2095  while ((j + 7) < i) {
2096  for (int k = 0; k < 8; ++k) // Unroll 8x
2097  sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
2098  weight += 8.0f;
2099  j+=8;
2100  }
2101  while (j < i) {
2102  sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
2103  weight += 1.0f;
2104  ++j;
2105  }
2106 
2107  while ((j + 7) < radius + i) {
2108  for (int k = 0; k < 8; ++k) // Unroll 8x
2109  sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
2110  weight -= 8.0f;
2111  j+=8;
2112  }
2113  while (j < radius + i) {
2114  sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
2115  weight -= 1.0f;
2116  ++j;
2117  }
2118 
2119  tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
2120  }
2121  }
2122  )
2123 
2124  STRINGIFY(
2125  __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
2126  const int radius,
2127  const float strength,
2128  const int imageWidth,
2129  const int imageHeight)
2130  {
2131  const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
2132 
2133  int x = get_global_id(0);
2134  int y = get_global_id(1);
2135 
2136  if ((x >= imageWidth) || (y >= imageHeight))
2137  return;
2138 
2139  global float *src = blurImage + x;
2140 
2141  float sum = 0.0f;
2142  float weight = 1.0f;
2143 
2144  int j = y - radius;
2145  while ((j + 7) < y) {
2146  for (int k = 0; k < 8; ++k) // Unroll 8x
2147  sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
2148  weight += 8.0f;
2149  j+=8;
2150  }
2151  while (j < y) {
2152  sum += weight * src[mirrorBottom(j) * imageWidth];
2153  weight += 1.0f;
2154  ++j;
2155  }
2156 
2157  while ((j + 7) < radius + y) {
2158  for (int k = 0; k < 8; ++k) // Unroll 8x
2159  sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
2160  weight -= 8.0f;
2161  j+=8;
2162  }
2163  while (j < radius + y) {
2164  sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
2165  weight -= 1.0f;
2166  ++j;
2167  }
2168 
2169  CLPixelType pixel = srcImage[x + y * imageWidth];
2170  float srcVal = dot(RGB, convert_float4(pixel));
2171  float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
2172  mult = (srcVal + mult) / srcVal;
2173 
2174  pixel.x = ClampToQuantum(pixel.x * mult);
2175  pixel.y = ClampToQuantum(pixel.y * mult);
2176  pixel.z = ClampToQuantum(pixel.z * mult);
2177 
2178  dstImage[x + y * imageWidth] = pixel;
2179  }
2180  )
2181 
2182 /*
2183 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2184 % %
2185 % %
2186 % %
2187 % M o d u l a t e %
2188 % %
2189 % %
2190 % %
2191 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2192 */
2193 
2194  STRINGIFY(
2195 
2196  inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
2197  float *hue, float *saturation, float *lightness)
2198  {
2199  float
2200  c,
2201  tmax,
2202  tmin;
2203 
2204  /*
2205  Convert RGB to HSL colorspace.
2206  */
2207  tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
2208  tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
2209 
2210  c=tmax-tmin;
2211 
2212  *lightness=(tmax+tmin)/2.0;
2213  if (c <= 0.0)
2214  {
2215  *hue=0.0;
2216  *saturation=0.0;
2217  return;
2218  }
2219 
2220  if (tmax == (QuantumScale*red))
2221  {
2222  *hue=(QuantumScale*green-QuantumScale*blue)/c;
2223  if ((QuantumScale*green) < (QuantumScale*blue))
2224  *hue+=6.0;
2225  }
2226  else
2227  if (tmax == (QuantumScale*green))
2228  *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
2229  else
2230  *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
2231 
2232  *hue*=60.0/360.0;
2233  if (*lightness <= 0.5)
2234  *saturation=c/(2.0*(*lightness));
2235  else
2236  *saturation=c/(2.0-2.0*(*lightness));
2237  }
2238 
2239  inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
2240  CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2241  {
2242  float
2243  b,
2244  c,
2245  g,
2246  h,
2247  tmin,
2248  r,
2249  x;
2250 
2251  /*
2252  Convert HSL to RGB colorspace.
2253  */
2254  h=hue*360.0;
2255  if (lightness <= 0.5)
2256  c=2.0*lightness*saturation;
2257  else
2258  c=(2.0-2.0*lightness)*saturation;
2259  tmin=lightness-0.5*c;
2260  h-=360.0*floor(h/360.0);
2261  h/=60.0;
2262  x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
2263  switch ((int) floor(h) % 6)
2264  {
2265  case 0:
2266  {
2267  r=tmin+c;
2268  g=tmin+x;
2269  b=tmin;
2270  break;
2271  }
2272  case 1:
2273  {
2274  r=tmin+x;
2275  g=tmin+c;
2276  b=tmin;
2277  break;
2278  }
2279  case 2:
2280  {
2281  r=tmin;
2282  g=tmin+c;
2283  b=tmin+x;
2284  break;
2285  }
2286  case 3:
2287  {
2288  r=tmin;
2289  g=tmin+x;
2290  b=tmin+c;
2291  break;
2292  }
2293  case 4:
2294  {
2295  r=tmin+x;
2296  g=tmin;
2297  b=tmin+c;
2298  break;
2299  }
2300  case 5:
2301  {
2302  r=tmin+c;
2303  g=tmin;
2304  b=tmin+x;
2305  break;
2306  }
2307  default:
2308  {
2309  r=0.0;
2310  g=0.0;
2311  b=0.0;
2312  }
2313  }
2314  *red=ClampToQuantum(QuantumRange*r);
2315  *green=ClampToQuantum(QuantumRange*g);
2316  *blue=ClampToQuantum(QuantumRange*b);
2317  }
2318 
2319  inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
2320  CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2321  {
2322  float
2323  hue,
2324  lightness,
2325  saturation;
2326 
2327  /*
2328  Increase or decrease color lightness, saturation, or hue.
2329  */
2330  ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
2331  hue+=0.5*(0.01*percent_hue-1.0);
2332  while (hue < 0.0)
2333  hue+=1.0;
2334  while (hue >= 1.0)
2335  hue-=1.0;
2336  saturation*=0.01*percent_saturation;
2337  lightness*=0.01*percent_lightness;
2338  ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
2339  }
2340 
2341  __kernel void Modulate(__global CLPixelType *im,
2342  const float percent_brightness,
2343  const float percent_hue,
2344  const float percent_saturation,
2345  const int colorspace)
2346  {
2347 
2348  const int x = get_global_id(0);
2349  const int y = get_global_id(1);
2350  const int columns = get_global_size(0);
2351  const int c = x + y * columns;
2352 
2353  CLPixelType pixel = im[c];
2354 
2355  CLQuantum
2356  blue,
2357  green,
2358  red;
2359 
2360  red=getRed(pixel);
2361  green=getGreen(pixel);
2362  blue=getBlue(pixel);
2363 
2364  switch (colorspace)
2365  {
2366  case HSLColorspace:
2367  default:
2368  {
2369  ModulateHSL(percent_hue, percent_saturation, percent_brightness,
2370  &red, &green, &blue);
2371  }
2372 
2373  }
2374 
2375  CLPixelType filteredPixel;
2376 
2377  setRed(&filteredPixel, red);
2378  setGreen(&filteredPixel, green);
2379  setBlue(&filteredPixel, blue);
2380  filteredPixel.w = pixel.w;
2381 
2382  im[c] = filteredPixel;
2383  }
2384  )
2385 
2386 /*
2387 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2388 % %
2389 % %
2390 % %
2391 % M o t i o n B l u r %
2392 % %
2393 % %
2394 % %
2395 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2396 */
2397 
2398  STRINGIFY(
2399  __kernel
2400  void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
2401  const unsigned int imageWidth, const unsigned int imageHeight,
2402  const __global float *filter, const unsigned int width, const __global int2* offset,
2403  const float4 bias,
2404  const ChannelType channel, const unsigned int matte) {
2405 
2406  int2 currentPixel;
2407  currentPixel.x = get_global_id(0);
2408  currentPixel.y = get_global_id(1);
2409 
2410  if (currentPixel.x >= imageWidth
2411  || currentPixel.y >= imageHeight)
2412  return;
2413 
2414  float4 pixel;
2415  pixel.x = (float)bias.x;
2416  pixel.y = (float)bias.y;
2417  pixel.z = (float)bias.z;
2418  pixel.w = (float)bias.w;
2419 
2420  if (((channel & OpacityChannel) == 0) || (matte == 0)) {
2421 
2422  for (int i = 0; i < width; i++) {
2423  // only support EdgeVirtualPixelMethod through ClampToCanvas
2424  // TODO: implement other virtual pixel method
2425  int2 samplePixel = currentPixel + offset[i];
2426  samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2427  samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2428  CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2429 
2430  pixel.x += (filter[i] * (float)samplePixelValue.x);
2431  pixel.y += (filter[i] * (float)samplePixelValue.y);
2432  pixel.z += (filter[i] * (float)samplePixelValue.z);
2433  pixel.w += (filter[i] * (float)samplePixelValue.w);
2434  }
2435 
2436  CLPixelType outputPixel;
2437  outputPixel.x = ClampToQuantum(pixel.x);
2438  outputPixel.y = ClampToQuantum(pixel.y);
2439  outputPixel.z = ClampToQuantum(pixel.z);
2440  outputPixel.w = ClampToQuantum(pixel.w);
2441  output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2442  }
2443  else {
2444 
2445  float gamma = 0.0f;
2446  for (int i = 0; i < width; i++) {
2447  // only support EdgeVirtualPixelMethod through ClampToCanvas
2448  // TODO: implement other virtual pixel method
2449  int2 samplePixel = currentPixel + offset[i];
2450  samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2451  samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2452 
2453  CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2454 
2455  float alpha = QuantumScale*(QuantumRange-samplePixelValue.w);
2456  float k = filter[i];
2457  pixel.x = pixel.x + k * alpha * samplePixelValue.x;
2458  pixel.y = pixel.y + k * alpha * samplePixelValue.y;
2459  pixel.z = pixel.z + k * alpha * samplePixelValue.z;
2460 
2461  pixel.w += k * alpha * samplePixelValue.w;
2462 
2463  gamma+=k*alpha;
2464  }
2465  gamma = PerceptibleReciprocal(gamma);
2466  pixel.xyz = gamma*pixel.xyz;
2467 
2468  CLPixelType outputPixel;
2469  outputPixel.x = ClampToQuantum(pixel.x);
2470  outputPixel.y = ClampToQuantum(pixel.y);
2471  outputPixel.z = ClampToQuantum(pixel.z);
2472  outputPixel.w = ClampToQuantum(pixel.w);
2473  output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2474  }
2475  }
2476  )
2477 
2478 /*
2479 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2480 % %
2481 % %
2482 % %
2483 % R a d i a l B l u r %
2484 % %
2485 % %
2486 % %
2487 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2488 */
2489 
2490  STRINGIFY(
2491  __kernel void RadialBlur(const __global CLPixelType *im, __global CLPixelType *filtered_im,
2492  const float4 bias,
2493  const unsigned int channel, const unsigned int matte,
2494  const float2 blurCenter,
2495  __constant float *cos_theta, __constant float *sin_theta,
2496  const unsigned int cossin_theta_size)
2497  {
2498  const int x = get_global_id(0);
2499  const int y = get_global_id(1);
2500  const int columns = get_global_size(0);
2501  const int rows = get_global_size(1);
2502  unsigned int step = 1;
2503  float center_x = (float) x - blurCenter.x;
2504  float center_y = (float) y - blurCenter.y;
2505  float radius = hypot(center_x, center_y);
2506 
2507  //float blur_radius = hypot((float) columns/2.0f, (float) rows/2.0f);
2508  float blur_radius = hypot(blurCenter.x, blurCenter.y);
2509 
2510  if (radius > MagickEpsilon)
2511  {
2512  step = (unsigned int) (blur_radius / radius);
2513  if (step == 0)
2514  step = 1;
2515  if (step >= cossin_theta_size)
2516  step = cossin_theta_size-1;
2517  }
2518 
2519  float4 result;
2520  result.x = (float)bias.x;
2521  result.y = (float)bias.y;
2522  result.z = (float)bias.z;
2523  result.w = (float)bias.w;
2524  float normalize = 0.0f;
2525 
2526  if (((channel & OpacityChannel) == 0) || (matte == 0)) {
2527  for (unsigned int i=0; i<cossin_theta_size; i+=step)
2528  {
2529  result += convert_float4(im[
2530  ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+
2531  ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
2532  normalize += 1.0f;
2533  }
2534  normalize = PerceptibleReciprocal(normalize);
2535  result = result * normalize;
2536  }
2537  else {
2538  float gamma = 0.0f;
2539  for (unsigned int i=0; i<cossin_theta_size; i+=step)
2540  {
2541  float4 p = convert_float4(im[
2542  ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+
2543  ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
2544 
2545  float alpha = (float)(QuantumScale*(QuantumRange-p.w));
2546  result.x += alpha * p.x;
2547  result.y += alpha * p.y;
2548  result.z += alpha * p.z;
2549  result.w += p.w;
2550  gamma+=alpha;
2551  normalize += 1.0f;
2552  }
2553  gamma = PerceptibleReciprocal(gamma);
2554  normalize = PerceptibleReciprocal(normalize);
2555  result.x = gamma*result.x;
2556  result.y = gamma*result.y;
2557  result.z = gamma*result.z;
2558  result.w = normalize*result.w;
2559  }
2560  filtered_im[y * columns + x] = (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
2561  ClampToQuantum(result.z), ClampToQuantum(result.w));
2562  }
2563  )
2564 
2565 /*
2566 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2567 % %
2568 % %
2569 % %
2570 % R e s i z e %
2571 % %
2572 % %
2573 % %
2574 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2575 */
2576 
2577  STRINGIFY(
2578  // Based on Box from resize.c
2579  float BoxResizeFilter(const float x)
2580  {
2581  return 1.0f;
2582  }
2583  )
2584 
2585  STRINGIFY(
2586  // Based on CubicBC from resize.c
2587  float CubicBC(const float x,const __global float* resizeFilterCoefficients)
2588  {
2589  /*
2590  Cubic Filters using B,C determined values:
2591  Mitchell-Netravali B = 1/3 C = 1/3 "Balanced" cubic spline filter
2592  Catmull-Rom B = 0 C = 1/2 Interpolatory and exact on linears
2593  Spline B = 1 C = 0 B-Spline Gaussian approximation
2594  Hermite B = 0 C = 0 B-Spline interpolator
2595 
2596  See paper by Mitchell and Netravali, Reconstruction Filters in Computer
2597  Graphics Computer Graphics, Volume 22, Number 4, August 1988
2598  http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
2599  Mitchell.pdf.
2600 
2601  Coefficents are determined from B,C values:
2602  P0 = ( 6 - 2*B )/6 = coeff[0]
2603  P1 = 0
2604  P2 = (-18 +12*B + 6*C )/6 = coeff[1]
2605  P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
2606  Q0 = ( 8*B +24*C )/6 = coeff[3]
2607  Q1 = ( -12*B -48*C )/6 = coeff[4]
2608  Q2 = ( 6*B +30*C )/6 = coeff[5]
2609  Q3 = ( - 1*B - 6*C )/6 = coeff[6]
2610 
2611  which are used to define the filter:
2612 
2613  P0 + P1*x + P2*x^2 + P3*x^3 0 <= x < 1
2614  Q0 + Q1*x + Q2*x^2 + Q3*x^3 1 <= x < 2
2615 
2616  which ensures function is continuous in value and derivative (slope).
2617  */
2618  if (x < 1.0)
2619  return(resizeFilterCoefficients[0]+x*(x*
2620  (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
2621  if (x < 2.0)
2622  return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
2623  (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
2624  return(0.0);
2625  }
2626  )
2627 
2628  STRINGIFY(
2629  float Sinc(const float x)
2630  {
2631  if (x != 0.0f)
2632  {
2633  const float alpha=(float) (MagickPI*x);
2634  return sinpi(x)/alpha;
2635  }
2636  return(1.0f);
2637  }
2638  )
2639 
2640  STRINGIFY(
2641  float Triangle(const float x)
2642  {
2643  /*
2644  1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
2645  a Bartlett 2D Cone filter. Also used as a Bartlett Windowing function
2646  for Sinc().
2647  */
2648  return ((x<1.0f)?(1.0f-x):0.0f);
2649  }
2650  )
2651 
2652 
2653  STRINGIFY(
2654  float Hanning(const float x)
2655  {
2656  /*
2657  Cosine window function:
2658  0.5+0.5*cos(pi*x).
2659  */
2660  const float cosine=cos((MagickPI*x));
2661  return(0.5f+0.5f*cosine);
2662  }
2663  )
2664 
2665  STRINGIFY(
2666  float Hamming(const float x)
2667  {
2668  /*
2669  Offset cosine window function:
2670  .54 + .46 cos(pi x).
2671  */
2672  const float cosine=cos((MagickPI*x));
2673  return(0.54f+0.46f*cosine);
2674  }
2675  )
2676 
2677  STRINGIFY(
2678  float Blackman(const float x)
2679  {
2680  /*
2681  Blackman: 2nd order cosine windowing function:
2682  0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
2683 
2684  Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
2685  five flops.
2686  */
2687  const float cosine=cos((MagickPI*x));
2688  return(0.34f+cosine*(0.5f+cosine*0.16f));
2689  }
2690  )
2691 
2692 
2693 
2694 
2695  STRINGIFY(
2696  inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
2697  {
2698  switch (filterType)
2699  {
2700  /* Call Sinc even for SincFast to get better precision on GPU
2701  and to avoid thread divergence. Sinc is pretty fast on GPU anyway...*/
2702  case SincWeightingFunction:
2704  return Sinc(x);
2706  return CubicBC(x,filterCoefficients);
2707  case BoxWeightingFunction:
2708  return BoxResizeFilter(x);
2710  return Triangle(x);
2712  return Hanning(x);
2714  return Hamming(x);
2716  return Blackman(x);
2717 
2718  default:
2719  return 0.0f;
2720  }
2721  }
2722  )
2723 
2724 
2725  STRINGIFY(
2726  inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
2727  , const ResizeWeightingFunctionType resizeWindowType
2728  , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
2729  {
2730  float scale;
2731  float xBlur = fabs(x/resizeFilterBlur);
2732  if (resizeWindowSupport < MagickEpsilon
2733  || resizeWindowType == BoxWeightingFunction)
2734  {
2735  scale = 1.0f;
2736  }
2737  else
2738  {
2739  scale = resizeFilterScale;
2740  scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
2741  }
2742  float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
2743  return weight;
2744  }
2745 
2746  )
2747 
2748  ;
2749  const char* accelerateKernels2 =
2750 
2751  STRINGIFY(
2752 
2753  inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2754  return (numWorkItems/pixelPerWorkgroup);
2755  }
2756 
2757  // returns the index of the pixel for the current workitem to compute.
2758  // returns -1 if this workitem doesn't need to participate in any computation
2759  inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2760  const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
2761  int pixelIndex = itemID/numWorkItemsPerPixel;
2762  pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
2763  return pixelIndex;
2764  }
2765 
2766  )
2767 
2768  STRINGIFY(
2769  __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
2770  void ResizeHorizontalFilter(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
2771  , const float xFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
2772  , const int resizeFilterType, const int resizeWindowType
2773  , const __global float* resizeFilterCubicCoefficients
2774  , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
2775  , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
2776  , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
2777 
2778 
2779  // calculate the range of resized image pixels computed by this workgroup
2780  const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
2781  const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
2782  const unsigned int actualNumPixelToCompute = stopX - startX;
2783 
2784  // calculate the range of input image pixels to cache
2785  float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
2786  const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2787  scale = PerceptibleReciprocal(scale);
2788 
2789  const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
2790  const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
2791 
2792  // cache the input pixels into local memory
2793  const unsigned int y = get_global_id(1);
2794  event_t e = async_work_group_copy(inputImageCache,inputImage+y*inputColumns+cacheRangeStartX,cacheRangeEndX-cacheRangeStartX,0);
2795  wait_group_events(1,&e);
2796 
2797  unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2798  for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2799  {
2800 
2801  const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
2802  const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
2803  const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
2804 
2805  // determine which resized pixel computed by this workitem
2806  const unsigned int itemID = get_local_id(0);
2807  const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
2808 
2809  const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
2810 
2811  float4 filteredPixel = (float4)0.0f;
2812  float density = 0.0f;
2813  float gamma = 0.0f;
2814  // -1 means this workitem doesn't participate in the computation
2815  if (pixelIndex != -1) {
2816 
2817  // x coordinated of the resized pixel computed by this workitem
2818  const int x = chunkStartX + pixelIndex;
2819 
2820  // calculate how many steps required for this pixel
2821  const float bisect = (x+0.5)/xFactor+MagickEpsilon;
2822  const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2823  const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
2824  const unsigned int n = stop - start;
2825 
2826  // calculate how many steps this workitem will contribute
2827  unsigned int numStepsPerWorkItem = n / numItems;
2828  numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2829 
2830  const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2831  if (startStep < n) {
2832  const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2833 
2834  unsigned int cacheIndex = start+startStep-cacheRangeStartX;
2835  if (matte == 0) {
2836 
2837  for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
2838  float4 cp = convert_float4(inputImageCache[cacheIndex]);
2839 
2840  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
2841  , (ResizeWeightingFunctionType)resizeWindowType
2842  , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
2843 
2844  filteredPixel += ((float4)weight)*cp;
2845  density+=weight;
2846  }
2847 
2848 
2849  }
2850  else {
2851  for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
2852  CLPixelType p = inputImageCache[cacheIndex];
2853 
2854  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
2855  , (ResizeWeightingFunctionType)resizeWindowType
2856  , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
2857 
2858  float alpha = weight * QuantumScale * GetPixelAlpha(p);
2859  float4 cp = convert_float4(p);
2860 
2861  filteredPixel.x += alpha * cp.x;
2862  filteredPixel.y += alpha * cp.y;
2863  filteredPixel.z += alpha * cp.z;
2864  filteredPixel.w += weight * cp.w;
2865 
2866  density+=weight;
2867  gamma+=alpha;
2868  }
2869  }
2870  }
2871  }
2872 
2873  // initialize the accumulators to zero
2874  if (itemID < actualNumPixelInThisChunk) {
2875  outputPixelCache[itemID] = (float4)0.0f;
2876  densityCache[itemID] = 0.0f;
2877  if (matte != 0)
2878  gammaCache[itemID] = 0.0f;
2879  }
2880  barrier(CLK_LOCAL_MEM_FENCE);
2881 
2882  // accumulatte the filtered pixel value and the density
2883  for (unsigned int i = 0; i < numItems; i++) {
2884  if (pixelIndex != -1) {
2885  if (itemID%numItems == i) {
2886  outputPixelCache[pixelIndex]+=filteredPixel;
2887  densityCache[pixelIndex]+=density;
2888  if (matte!=0) {
2889  gammaCache[pixelIndex]+=gamma;
2890  }
2891  }
2892  }
2893  barrier(CLK_LOCAL_MEM_FENCE);
2894  }
2895 
2896  if (itemID < actualNumPixelInThisChunk) {
2897  if (matte==0) {
2898  float density = densityCache[itemID];
2899  float4 filteredPixel = outputPixelCache[itemID];
2900  if (density!= 0.0f && density != 1.0)
2901  {
2902  density = PerceptibleReciprocal(density);
2903  filteredPixel *= (float4)density;
2904  }
2905  filteredImage[y*filteredColumns+chunkStartX+itemID] = (CLPixelType) (ClampToQuantum(filteredPixel.x)
2906  , ClampToQuantum(filteredPixel.y)
2907  , ClampToQuantum(filteredPixel.z)
2908  , ClampToQuantum(filteredPixel.w));
2909  }
2910  else {
2911  float density = densityCache[itemID];
2912  float gamma = gammaCache[itemID];
2913  float4 filteredPixel = outputPixelCache[itemID];
2914 
2915  if (density!= 0.0f && density != 1.0) {
2916  density = PerceptibleReciprocal(density);
2917  filteredPixel *= (float4)density;
2918  gamma *= density;
2919  }
2920  gamma = PerceptibleReciprocal(gamma);
2921 
2922  CLPixelType fp;
2923  fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
2924  , ClampToQuantum(gamma*filteredPixel.y)
2925  , ClampToQuantum(gamma*filteredPixel.z)
2926  , ClampToQuantum(filteredPixel.w));
2927 
2928  filteredImage[y*filteredColumns+chunkStartX+itemID] = fp;
2929 
2930  }
2931  }
2932 
2933  } // end of chunking loop
2934  }
2935  )
2936 
2937 
2938  STRINGIFY(
2939  __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2940  void ResizeVerticalFilter(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
2941  , const float yFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
2942  , const int resizeFilterType, const int resizeWindowType
2943  , const __global float* resizeFilterCubicCoefficients
2944  , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
2945  , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
2946  , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
2947 
2948 
2949  // calculate the range of resized image pixels computed by this workgroup
2950  const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
2951  const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
2952  const unsigned int actualNumPixelToCompute = stopY - startY;
2953 
2954  // calculate the range of input image pixels to cache
2955  float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
2956  const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2957  scale = PerceptibleReciprocal(scale);
2958 
2959  const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
2960  const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
2961 
2962  // cache the input pixels into local memory
2963  const unsigned int x = get_global_id(0);
2964  event_t e = async_work_group_strided_copy(inputImageCache, inputImage+cacheRangeStartY*inputColumns+x, cacheRangeEndY-cacheRangeStartY, inputColumns, 0);
2965  wait_group_events(1,&e);
2966 
2967  unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2968  for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2969  {
2970 
2971  const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
2972  const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
2973  const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
2974 
2975  // determine which resized pixel computed by this workitem
2976  const unsigned int itemID = get_local_id(1);
2977  const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
2978 
2979  const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
2980 
2981  float4 filteredPixel = (float4)0.0f;
2982  float density = 0.0f;
2983  float gamma = 0.0f;
2984  // -1 means this workitem doesn't participate in the computation
2985  if (pixelIndex != -1) {
2986 
2987  // x coordinated of the resized pixel computed by this workitem
2988  const int y = chunkStartY + pixelIndex;
2989 
2990  // calculate how many steps required for this pixel
2991  const float bisect = (y+0.5)/yFactor+MagickEpsilon;
2992  const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2993  const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
2994  const unsigned int n = stop - start;
2995 
2996  // calculate how many steps this workitem will contribute
2997  unsigned int numStepsPerWorkItem = n / numItems;
2998  numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2999 
3000  const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
3001  if (startStep < n) {
3002  const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
3003 
3004  unsigned int cacheIndex = start+startStep-cacheRangeStartY;
3005  if (matte == 0) {
3006 
3007  for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
3008  float4 cp = convert_float4(inputImageCache[cacheIndex]);
3009 
3010  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
3011  , (ResizeWeightingFunctionType)resizeWindowType
3012  , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
3013 
3014  filteredPixel += ((float4)weight)*cp;
3015  density+=weight;
3016  }
3017 
3018 
3019  }
3020  else {
3021  for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
3022  CLPixelType p = inputImageCache[cacheIndex];
3023 
3024  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
3025  , (ResizeWeightingFunctionType)resizeWindowType
3026  , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
3027 
3028  float alpha = weight * QuantumScale * GetPixelAlpha(p);
3029  float4 cp = convert_float4(p);
3030 
3031  filteredPixel.x += alpha * cp.x;
3032  filteredPixel.y += alpha * cp.y;
3033  filteredPixel.z += alpha * cp.z;
3034  filteredPixel.w += weight * cp.w;
3035 
3036  density+=weight;
3037  gamma+=alpha;
3038  }
3039  }
3040  }
3041  }
3042 
3043  // initialize the accumulators to zero
3044  if (itemID < actualNumPixelInThisChunk) {
3045  outputPixelCache[itemID] = (float4)0.0f;
3046  densityCache[itemID] = 0.0f;
3047  if (matte != 0)
3048  gammaCache[itemID] = 0.0f;
3049  }
3050  barrier(CLK_LOCAL_MEM_FENCE);
3051 
3052  // accumulatte the filtered pixel value and the density
3053  for (unsigned int i = 0; i < numItems; i++) {
3054  if (pixelIndex != -1) {
3055  if (itemID%numItems == i) {
3056  outputPixelCache[pixelIndex]+=filteredPixel;
3057  densityCache[pixelIndex]+=density;
3058  if (matte!=0) {
3059  gammaCache[pixelIndex]+=gamma;
3060  }
3061  }
3062  }
3063  barrier(CLK_LOCAL_MEM_FENCE);
3064  }
3065 
3066  if (itemID < actualNumPixelInThisChunk) {
3067  if (matte==0) {
3068  float density = densityCache[itemID];
3069  float4 filteredPixel = outputPixelCache[itemID];
3070  if (density!= 0.0f && density != 1.0)
3071  {
3072  density = PerceptibleReciprocal(density);
3073  filteredPixel *= (float4)density;
3074  }
3075  filteredImage[(chunkStartY+itemID)*filteredColumns+x] = (CLPixelType) (ClampToQuantum(filteredPixel.x)
3076  , ClampToQuantum(filteredPixel.y)
3077  , ClampToQuantum(filteredPixel.z)
3078  , ClampToQuantum(filteredPixel.w));
3079  }
3080  else {
3081  float density = densityCache[itemID];
3082  float gamma = gammaCache[itemID];
3083  float4 filteredPixel = outputPixelCache[itemID];
3084 
3085  if (density!= 0.0f && density != 1.0) {
3086  density = PerceptibleReciprocal(density);
3087  filteredPixel *= (float4)density;
3088  gamma *= density;
3089  }
3090  gamma = PerceptibleReciprocal(gamma);
3091 
3092  CLPixelType fp;
3093  fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
3094  , ClampToQuantum(gamma*filteredPixel.y)
3095  , ClampToQuantum(gamma*filteredPixel.z)
3096  , ClampToQuantum(filteredPixel.w));
3097 
3098  filteredImage[(chunkStartY+itemID)*filteredColumns+x] = fp;
3099 
3100  }
3101  }
3102 
3103  } // end of chunking loop
3104  }
3105  )
3106 
3107 /*
3108 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3109 % %
3110 % %
3111 % %
3112 % U n s h a r p M a s k %
3113 % %
3114 % %
3115 % %
3116 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3117 */
3118 
3119  STRINGIFY(
3120  __kernel void UnsharpMaskBlurColumn(const __global CLPixelType* inputImage,
3121  const __global float4 *blurRowData, __global CLPixelType *filtered_im,
3122  const unsigned int imageColumns, const unsigned int imageRows,
3123  __local float4* cachedData, __local float* cachedFilter,
3124  const ChannelType channel, const __global float *filter, const unsigned int width,
3125  const float gain, const float threshold)
3126  {
3127  const unsigned int radius = (width-1)/2;
3128 
3129  // cache the pixel shared by the workgroup
3130  const int groupX = get_group_id(0);
3131  const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
3132  const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
3133 
3134  if (groupStartY >= 0
3135  && groupStopY < imageRows) {
3136  event_t e = async_work_group_strided_copy(cachedData
3137  ,blurRowData+groupStartY*imageColumns+groupX
3138  ,groupStopY-groupStartY,imageColumns,0);
3139  wait_group_events(1,&e);
3140  }
3141  else {
3142  for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) {
3143  cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,imageRows)*imageColumns+ groupX];
3144  }
3145  barrier(CLK_LOCAL_MEM_FENCE);
3146  }
3147  // cache the filter as well
3148  event_t e = async_work_group_copy(cachedFilter,filter,width,0);
3149  wait_group_events(1,&e);
3150 
3151  // only do the work if this is not a patched item
3152  //const int cy = get_group_id(1)*get_local_size(1)+get_local_id(1);
3153  const int cy = get_global_id(1);
3154 
3155  if (cy < imageRows) {
3156  float4 blurredPixel = (float4) 0.0f;
3157 
3158  int i = 0;
3159 
3160  \n #ifndef UFACTOR \n
3161  \n #define UFACTOR 8 \n
3162  \n #endif \n
3163 
3164  for ( ; i+UFACTOR < width; )
3165  {
3166  \n #pragma unroll UFACTOR \n
3167  for (int j=0; j < UFACTOR; j++, i++)
3168  {
3169  blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
3170  }
3171  }
3172 
3173  for ( ; i < width; i++)
3174  {
3175  blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
3176  }
3177 
3178  blurredPixel = floor((float4)(ClampToQuantum(blurredPixel.x), ClampToQuantum(blurredPixel.y)
3179  ,ClampToQuantum(blurredPixel.z), ClampToQuantum(blurredPixel.w)));
3180 
3181  float4 inputImagePixel = convert_float4(inputImage[cy*imageColumns+groupX]);
3182  float4 outputPixel = inputImagePixel - blurredPixel;
3183 
3184  float quantumThreshold = QuantumRange*threshold;
3185 
3186  int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
3187  outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
3188 
3189  //write back
3190  filtered_im[cy*imageColumns+groupX] = (CLPixelType) (ClampToQuantum(outputPixel.x), ClampToQuantum(outputPixel.y)
3191  ,ClampToQuantum(outputPixel.z), ClampToQuantum(outputPixel.w));
3192 
3193  }
3194  }
3195  )
3196 
3197 
3198 
3199  STRINGIFY(
3200  __kernel void UnsharpMask(__global CLPixelType *im, __global CLPixelType *filtered_im,
3201  __constant float *filter,
3202  const unsigned int width,
3203  const unsigned int imageColumns, const unsigned int imageRows,
3204  __local float4 *pixels,
3205  const float gain, const float threshold, const unsigned int justBlur)
3206  {
3207  const int x = get_global_id(0);
3208  const int y = get_global_id(1);
3209 
3210  const unsigned int radius = (width - 1) / 2;
3211 
3212  int row = y - radius;
3213  int baseRow = get_group_id(1) * get_local_size(1) - radius;
3214  int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
3215 
3216  while (row < endRow) {
3217  int srcy = (row < 0) ? -row : row; // mirror pad
3218  srcy = (srcy >= imageRows) ? (2 * imageRows - srcy - 1) : srcy;
3219 
3220  float4 value = 0.0f;
3221 
3222  int ix = x - radius;
3223  int i = 0;
3224 
3225  while (i + 7 < width) {
3226  for (int j = 0; j < 8; ++j) { // unrolled
3227  int srcx = ix + j;
3228  srcx = (srcx < 0) ? -srcx : srcx;
3229  srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx;
3230  value += filter[i + j] * convert_float4(im[srcx + srcy * imageColumns]);
3231  }
3232  ix += 8;
3233  i += 8;
3234  }
3235 
3236  while (i < width) {
3237  int srcx = (ix < 0) ? -ix : ix; // mirror pad
3238  srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx;
3239  value += filter[i] * convert_float4(im[srcx + srcy * imageColumns]);
3240  ++i;
3241  ++ix;
3242  }
3243  pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
3244  row += get_local_size(1);
3245  }
3246 
3247 
3248  barrier(CLK_LOCAL_MEM_FENCE);
3249 
3250 
3251  const int px = get_local_id(0);
3252  const int py = get_local_id(1);
3253  const int prp = get_local_size(0);
3254  float4 value = (float4)(0.0f);
3255 
3256  int i = 0;
3257  while (i + 7 < width) { // unrolled
3258  value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
3259  value += (float4)(filter[i]) * pixels[px + (py + i + 1) * prp];
3260  value += (float4)(filter[i]) * pixels[px + (py + i + 2) * prp];
3261  value += (float4)(filter[i]) * pixels[px + (py + i + 3) * prp];
3262  value += (float4)(filter[i]) * pixels[px + (py + i + 4) * prp];
3263  value += (float4)(filter[i]) * pixels[px + (py + i + 5) * prp];
3264  value += (float4)(filter[i]) * pixels[px + (py + i + 6) * prp];
3265  value += (float4)(filter[i]) * pixels[px + (py + i + 7) * prp];
3266  i += 8;
3267  }
3268  while (i < width) {
3269  value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
3270  ++i;
3271  }
3272  if ((x < imageColumns) && (y < imageRows)) {
3273  if (justBlur == 0) { // apply sharpening
3274  float4 srcPixel = convert_float4(im[x + y * imageColumns]);
3275  float4 diff = srcPixel - value;
3276 
3277  float quantumThreshold = QuantumRange*threshold;
3278 
3279  int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
3280  value = select(srcPixel + diff * gain, srcPixel, mask);
3281  }
3282  filtered_im[x + y * imageColumns] = (CLPixelType)(ClampToQuantum(value.s0), ClampToQuantum(value.s1), ClampToQuantum(value.s2), ClampToQuantum(value.s3));
3283  }
3284  }
3285  )
3286 
3287  STRINGIFY(
3288  __kernel __attribute__((reqd_work_group_size(64, 4, 1))) void WaveletDenoise(__global CLPixelType *srcImage, __global CLPixelType *dstImage,
3289  const float threshold,
3290  const int passes,
3291  const int imageWidth,
3292  const int imageHeight)
3293  {
3294  const int pad = (1 << (passes - 1));
3295  const int tileSize = 64;
3296  const int tileRowPixels = 64;
3297  const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
3298 
3299  CLPixelType stage[16];
3300 
3301  local float buffer[64 * 64];
3302 
3303  int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0);
3304  int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad;
3305 
3306  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3307  stage[i / 4] = srcImage[mirrorTop(mirrorBottom(srcx), imageWidth) + (mirrorTop(mirrorBottom(srcy + i) , imageHeight)) * imageWidth];
3308  }
3309 
3310 
3311  for (int channel = 0; channel < 3; ++channel) {
3312  // Load LDS
3313  switch (channel) {
3314  case 0:
3315  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3316  buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s0);
3317  break;
3318  case 1:
3319  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3320  buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s1);
3321  break;
3322  case 2:
3323  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3324  buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s2);
3325  break;
3326  }
3327 
3328 
3329  // Process
3330 
3331  float tmp[16];
3332  float accum[16];
3333  float pixel;
3334 
3335  for (int pass = 0; pass < passes; ++pass) {
3336  const int radius = 1 << pass;
3337  const int x = get_local_id(0);
3338  const float thresh = threshold * noise[pass];
3339 
3340  if (pass == 0)
3341  accum[0] = accum[1] = accum[2] = accum[3] = accum[4] = accum[5] = accum[6] = accum[6] = accum[7] = accum[8] = accum[9] = accum[10] = accum[11] = accum[12] = accum[13] = accum[14] = accum[15] = 0.0f;
3342 
3343  // Snapshot input
3344 
3345  // Apply horizontal hat
3346  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3347  const int offset = i * tileRowPixels;
3348  if (pass == 0)
3349  tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
3350  pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
3351  barrier(CLK_LOCAL_MEM_FENCE);
3352  buffer[x + offset] = pixel;
3353  }
3354  barrier(CLK_LOCAL_MEM_FENCE);
3355  // Apply vertical hat
3356  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3357  pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
3358  float delta = tmp[i / 4] - pixel;
3359  tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
3360  if (delta < -thresh)
3361  delta += thresh;
3362  else if (delta > thresh)
3363  delta -= thresh;
3364  else
3365  delta = 0;
3366  accum[i / 4] += delta;
3367 
3368  }
3369  barrier(CLK_LOCAL_MEM_FENCE);
3370  if (pass < passes - 1)
3371  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3372  buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
3373  else // last pass
3374  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3375  accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
3376  barrier(CLK_LOCAL_MEM_FENCE);
3377  }
3378 
3379  switch (channel) {
3380  case 0:
3381  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3382  stage[i / 4].s0 = ClampToQuantum(accum[i / 4]);
3383  break;
3384  case 1:
3385  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3386  stage[i / 4].s1 = ClampToQuantum(accum[i / 4]);
3387  break;
3388  case 2:
3389  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3390  stage[i / 4].s2 = ClampToQuantum(accum[i / 4]);
3391  break;
3392  }
3393 
3394  barrier(CLK_LOCAL_MEM_FENCE);
3395  }
3396 
3397  // Write from stage to output
3398 
3399  if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
3400  //for (int i = pad + get_local_id(1); i < tileSize - pad; i += get_local_size(1)) {
3401  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3402  if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
3403  dstImage[srcx + (srcy + i) * imageWidth] = stage[i / 4];
3404  }
3405  }
3406  }
3407  }
3408  )
3409  ;
3410 
3411 #endif // MAGICKCORE_OPENCL_SUPPORT
3412 
3413 #if defined(__cplusplus) || defined(c_plusplus)
3414 }
3415 #endif
3416 
3417 #endif // MAGICKCORE_ACCELERATE_PRIVATE_H
Definition: composite.h:91
Definition: composite.h:94
Definition: composite.h:65
Definition: colorspace.h:44
Definition: resize-private.h:31
Definition: colorspace.h:36
#define SigmaPoisson
Definition: resize-private.h:37
Definition: pixel.h:75
Definition: statistic.h:116
Definition: resize-private.h:33
Definition: magick-type.h:202
Definition: composite.h:75
Definition: pixel.h:72
Definition: colorspace.h:40
static void MagickPixelCompositeBlend(const MagickPixelPacket *p, const MagickRealType alpha, const MagickPixelPacket *q, const MagickRealType beta, MagickPixelPacket *composite)
Definition: composite-private.h:138
Definition: composite.h:31
Definition: composite.h:93
Definition: colorspace.h:45
Definition: colorspace.h:33
Definition: composite.h:80
#define SigmaRandom
Definition: composite.h:33
Definition: resize-private.h:40
Definition: composite.h:90
Definition: resize-private.h:29
static MagickRealType ColorDodge(const MagickRealType Sca, const MagickRealType Sa, const MagickRealType Dca, const MagickRealType Da)
Definition: composite.c:293
Definition: fx.h:34
PixelIntensityMethod
Definition: pixel.h:67
Definition: magick-type.h:191
Definition: composite.h:95
Definition: colorspace.h:59
Definition: magick-type.h:197
Definition: composite.h:59
Definition: composite.h:89
Definition: magick-type.h:186
Definition: composite.h:27
Definition: colorspace.h:41
Definition: colorspace.h:37
static MagickRealType RoundToUnity(const MagickRealType value)
Definition: composite-private.h:33
Definition: composite.h:35
Definition: composite.h:87
#define MagickPI
Definition: image-private.h:28
Definition: colorspace.h:58
Definition: colorspace.h:50
static MagickRealType Hanning(const MagickRealType x, const ResizeFilter *magick_unused(resize_filter))
Definition: resize.c:287
Definition: colorspace.h:47
Definition: fx.h:29
Definition: statistic.h:115
Definition: colorspace.h:31
#define MAGICKCORE_QUANTUM_DEPTH
Definition: magick-type.h:28
Definition: composite.h:53
Definition: colorspace.h:35
Definition: resize-private.h:38
Definition: pixel.h:77
#define MagickEpsilon
Definition: magick-type.h:143
#define SigmaLaplacian
MagickExport void ConvertRGBToHSL(const Quantum red, const Quantum green, const Quantum blue, double *hue, double *saturation, double *lightness)
Definition: gem.c:1127
Definition: magick-type.h:192
Definition: colorspace.h:48
static Quantum ClampToQuantum(const MagickRealType quantum)
Definition: quantum.h:88
Definition: statistic.h:117
Definition: magick-type.h:204
NoiseType
Definition: fx.h:27
Definition: colorspace.h:52
Definition: composite.h:47
static MagickRealType Hamming(const MagickRealType x, const ResizeFilter *magick_unused(resize_filter))
Definition: resize.c:301
Definition: resize-private.h:41
Definition: composite.h:73
Definition: composite.h:29
Definition: composite.h:72
Definition: composite.h:42
Definition: colorspace.h:43
#define SigmaUniform
Definition: composite.h:97
static void ModulateHSL(const double percent_hue, const double percent_saturation, const double percent_lightness, Quantum *red, Quantum *green, Quantum *blue)
Definition: enhance.c:3541
Definition: colorspace.h:34
Definition: colorspace.h:57
Definition: resize-private.h:30
static double PerceptibleReciprocal(const double x)
Definition: pixel-accessor.h:124
Definition: composite.h:54
#define GetPixelAlpha(pixel)
Definition: pixel-accessor.h:36
Definition: composite.h:38
Definition: composite.h:68
Definition: composite.h:96
Definition: magick-type.h:188
Definition: composite.h:71
Definition: resize-private.h:32
Definition: composite.h:55
Definition: composite.h:56
Definition: fx.h:31
#define SigmaGaussian
Definition: composite.h:69
Definition: pixel.h:71
static Quantum ApplyFunction(Quantum pixel, const MagickFunction function, const size_t number_parameters, const double *parameters, ExceptionInfo *exception)
Definition: statistic.c:962
Definition: colorspace.h:38
Definition: pixel.h:70
Definition: composite.h:86
Definition: resize-private.h:36
Definition: colorspace.h:30
#define SigmaMultiplicativeGaussian
Definition: composite.h:49
Definition: composite.h:44
#define TauGaussian
MagickExport void ConvertRGBToHSB(const Quantum red, const Quantum green, const Quantum blue, double *hue, double *saturation, double *brightness)
Definition: gem.c:994
Definition: magick-type.h:190
static void Contrast(const int sign, Quantum *red, Quantum *green, Quantum *blue)
Definition: enhance.c:913
Definition: magick-type.h:205
Definition: composite.h:46
Definition: statistic.h:113
Definition: composite.h:28
Definition: magick-type.h:185
Definition: magick-type.h:194
Definition: colorspace.h:54
Definition: magick-type.h:193
Definition: resize-private.h:39
Definition: composite.h:78
Definition: resize-private.h:34
#define QuantumScale
Definition: magick-type.h:146
Definition: colorspace.h:55
Definition: fx.h:33
Definition: composite.h:62
Definition: colorspace.h:39
#define MaxMap
Definition: magick-type.h:74
Definition: magick-type.h:201
#define MagickMax(x, y)
Definition: image-private.h:26
Definition: composite.h:98
Definition: composite.h:39
static void CompositeColorDodge(const MagickPixelPacket *p, const MagickPixelPacket *q, MagickPixelPacket *composite)
Definition: composite.c:330
MagickExport void ConvertHSBToRGB(const double hue, const double saturation, const double brightness, Quantum *red, Quantum *green, Quantum *blue)
Definition: gem.c:284
Definition: composite.h:45
ChannelType
Definition: magick-type.h:181
Definition: composite.h:70
Definition: colorspace.h:46
Definition: resize-private.h:28
Definition: composite.h:81
Definition: composite.h:41
Definition: composite.h:52
Definition: pixel.h:69
Definition: colorspace.h:49
MagickExport void ConvertHSLToRGB(const double hue, const double saturation, const double lightness, Quantum *red, Quantum *green, Quantum *blue)
Definition: gem.c:460
Definition: composite.h:77
Definition: colorspace.h:53
Definition: composite.h:61
Definition: magick-type.h:187
static void MagickPixelCompositePlus(const MagickPixelPacket *p, const MagickRealType alpha, const MagickPixelPacket *q, const MagickRealType beta, MagickPixelPacket *composite)
Definition: composite-private.h:111
Definition: composite.h:76
Definition: magick-type.h:183
Definition: colorspace.h:28
Definition: resize-private.h:42
Definition: composite.h:50
Definition: composite.h:36
Definition: composite.h:43
MagickExport MagickRealType GetPixelIntensity(const Image *image, const PixelPacket *magick_restrict pixel)
Definition: pixel.c:2285
static MagickRealType Sinc(const MagickRealType, const ResizeFilter *)
Definition: composite.h:37
Definition: composite.h:60
Definition: statistic.h:114
ResizeWeightingFunctionType
Definition: resize-private.h:25
static MagickRealType Blackman(const MagickRealType x, const ResizeFilter *magick_unused(resize_filter))
Definition: resize.c:148
Definition: colorspace.h:56
#define MagickMin(x, y)
Definition: image-private.h:27
ColorspaceType
Definition: colorspace.h:25
Definition: composite.h:32
Definition: colorspace.h:29
Definition: composite.h:88
Definition: colorspace.h:42
#define SigmaImpulse
Definition: composite.h:48
Definition: composite.h:64
Definition: magick-type.h:189
Definition: colorspace.h:51
CompositeOperator
Definition: composite.h:25
Definition: composite.h:79
Definition: magick-type.h:196
Definition: colorspace.h:32
Definition: pixel.h:78
Definition: composite.h:66
Definition: composite.h:30
Definition: colorspace.h:60
Definition: magick-type.h:184
Definition: composite.h:63
Definition: composite.h:58
Definition: composite.h:92
Definition: magick-type.h:203
Definition: composite.h:34
static MagickRealType CubicBC(const MagickRealType x, const ResizeFilter *resize_filter)
Definition: resize.c:210
Definition: resize-private.h:27
Definition: composite.h:74
Definition: colorspace.h:27
MagickFunction
Definition: statistic.h:111
Definition: fx.h:30
Definition: composite.h:40
Definition: composite.h:67
Definition: resize-private.h:35
#define QuantumRange
Definition: magick-type.h:98
static MagickRealType Triangle(const MagickRealType x, const ResizeFilter *magick_unused(resize_filter))
Definition: resize.c:514
Definition: fx.h:35
Definition: pixel.h:73
Definition: composite.h:51
Definition: fx.h:32
Definition: magick-type.h:195
Definition: fx.h:36
Definition: composite.h:57