Logo Search packages:      
Sourcecode: fftw version File versions  Download package

rexec_threads.c

/*
 * Copyright (c) 1997-1999 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

/*
 * rexec_threads.c -- execute the fft in parallel
 */

#include <stdio.h>
#include <stdlib.h>

#include <fftw_threads-int.h>
#include <rfftw_threads.h>

extern void rfftw_strided_copy(int n, fftw_real *in, int ostride,
                         fftw_real *out);

static void rexec_simple_threads(int n, fftw_real *in,
                         fftw_real *out,
                         fftw_plan_node *p,
                         int istride,
                         int ostride,
                         int nthreads);

typedef struct {
     int m,r;
     fftw_real *in;
     fftw_real *out;
     fftw_plan_node *p;
     int istride, ostride;
     int nthreads;
} rexec_simple_data;

static void *rexec_simple_thread_r2c(fftw_loop_data *ldata)
{
     int min = ldata->min, max = ldata->max;
     rexec_simple_data *d = (rexec_simple_data *) ldata->data;
     int m = d->m, r = d->r;
     fftw_real *in = d->in;
     fftw_real *out = d->out;
     fftw_plan_node *p = d->p;
     int istride = d->istride, ostride = d->ostride;
     int nthreads = d->nthreads;
     
     for (; min < max; ++min)
        rexec_simple_threads(m, in + min * istride,
                         out + min * (m * ostride),
                         p,
                         istride * r, ostride,
                         nthreads);

     return 0;
}

static void *rexec_simple_thread_c2r(fftw_loop_data *ldata)
{
     int min = ldata->min, max = ldata->max;
     rexec_simple_data *d = (rexec_simple_data *) ldata->data;
     int m = d->m, r = d->r;
     fftw_real *in = d->in;
     fftw_real *out = d->out;
     fftw_plan_node *p = d->p;
     int istride = d->istride, ostride = d->ostride;
     int nthreads = d->nthreads;
     
     for (; min < max; ++min)
        rexec_simple_threads(m, in + min * (m * istride),
                         out + min * ostride,
                         p,
                         istride, ostride * r,
                         nthreads);

     return 0;
}

static void spawn_h2hc_recurse_threads(int m, int r,
                               fftw_real *in,
                               fftw_real *out,
                               fftw_plan_node *p,
                               int istride,
                               int ostride,
                               int nthreads)
{
     rexec_simple_data d;
     
     d.m = m; d.r = r;
     d.in = in; d.out = out;
     d.p = p->nodeu.hc2hc.recurse;
     d.istride = istride;
     d.ostride = ostride;
     d.nthreads = nthreads / r;
     
     switch (p->nodeu.hc2hc.dir) {
       case FFTW_REAL_TO_COMPLEX:
            fftw_thread_spawn_loop(r, nthreads,
                             rexec_simple_thread_r2c, &d);
            break;
       case FFTW_COMPLEX_TO_REAL:
            fftw_thread_spawn_loop(r, nthreads,
                             rexec_simple_thread_c2r, &d);
            break;
     }
}

static void rexec_simple_threads(int n, fftw_real *in, fftw_real *out,
                         fftw_plan_node *p,
                         int istride,
                         int ostride,
                         int nthreads)
{
     switch (p->type) {
       case FFTW_REAL2HC:
            HACK_ALIGN_STACK_ODD;
            (p->nodeu.real2hc.codelet) (in, out, out + n * ostride,
                                istride, ostride, -ostride);
            break;

       case FFTW_HC2REAL:
            HACK_ALIGN_STACK_ODD;
            (p->nodeu.hc2real.codelet) (in, in + n * istride, out,
                                istride, -istride, ostride);
            break;

       case FFTW_HC2HC:
            {
               int r = p->nodeu.hc2hc.size;
               int m = n / r;
               int i;
               fftw_hc2hc_codelet *codelet;
               fftw_complex *W;

               if (nthreads <= 1) { 
                  switch (p->nodeu.hc2hc.dir) {
                      case FFTW_REAL_TO_COMPLEX:
                         for (i = 0; i < r; ++i)
                              rfftw_executor_simple(m,
                                        in + i * istride,
                                        out + i * (m * ostride),
                                        p->nodeu.hc2hc.recurse,
                                        istride * r, ostride,
                                              FFTW_NORMAL_RECURSE);

                         W = p->nodeu.hc2hc.tw->twarray;
                         codelet = p->nodeu.hc2hc.codelet;
                         HACK_ALIGN_STACK_EVEN;
                         codelet(out, W, m * ostride, m, ostride);
                         break;
                      case FFTW_COMPLEX_TO_REAL:
                         W = p->nodeu.hc2hc.tw->twarray;
                         codelet = p->nodeu.hc2hc.codelet;
                         HACK_ALIGN_STACK_EVEN;
                         codelet(in, W, m * istride, m, istride);
                         
                         for (i = 0; i < r; ++i)
                              rfftw_executor_simple(m,
                                        in + i * (m * istride),
                                        out + i * ostride,
                                        p->nodeu.hc2hc.recurse,
                                        istride, ostride * r,
                                        FFTW_NORMAL_RECURSE);
                         break;
                      default:
                         goto bug;
                  }
               }
               else
                  switch (p->nodeu.hc2hc.dir) {
                      case FFTW_REAL_TO_COMPLEX:
                         spawn_h2hc_recurse_threads(m, r, in, out, p,
                                              istride, ostride,
                                              nthreads);

                         W = p->nodeu.hc2hc.tw->twarray;
                         codelet = p->nodeu.hc2hc.codelet;
                         HACK_ALIGN_STACK_EVEN;
                         codelet(out, W, m * ostride, m, ostride);

                         break;
                      case FFTW_COMPLEX_TO_REAL:
                         W = p->nodeu.hc2hc.tw->twarray;
                         codelet = p->nodeu.hc2hc.codelet;
                         HACK_ALIGN_STACK_EVEN;
                         codelet(in, W, m * istride, m, istride);
                         
                         spawn_h2hc_recurse_threads(m, r, in, out, p,
                                              istride, ostride,
                                              nthreads);
                         break;
                  }
               
               break;
            }

       case FFTW_RGENERIC:
            {
               int r = p->nodeu.rgeneric.size;
               int m = n / r;
               int i;
               fftw_rgeneric_codelet *codelet = p->nodeu.rgeneric.codelet;
               fftw_complex *W = p->nodeu.rgeneric.tw->twarray;

               if (nthreads <= 1)
                  switch (p->nodeu.rgeneric.dir) {
                      case FFTW_REAL_TO_COMPLEX:
                         for (i = 0; i < r; ++i)
                              rfftw_executor_simple(m,
                                        in + i * istride,
                                        out + i * (m * ostride),
                                        p->nodeu.rgeneric.recurse,
                                        istride * r, ostride,
                                        FFTW_NORMAL_RECURSE);
                         
                         codelet(out, W, m, r, n, ostride);
                         break;
                      case FFTW_COMPLEX_TO_REAL:
                         codelet(in, W, m, r, n, istride);
                         
                         for (i = 0; i < r; ++i)
                              rfftw_executor_simple(m,
                                        in + i * m * istride,
                                        out + i * ostride,
                                        p->nodeu.rgeneric.recurse,
                                        istride, ostride * r,
                                        FFTW_NORMAL_RECURSE);
                         break;
                      default:
                         goto bug;
                  }
               else
                  switch (p->nodeu.hc2hc.dir) {
                      case FFTW_REAL_TO_COMPLEX:
                         spawn_h2hc_recurse_threads(m, r, in, out, p,
                                              istride, ostride,
                                              nthreads);
                         codelet(out, W, m, r, n, ostride);
                         break;
                      case FFTW_COMPLEX_TO_REAL:
                         codelet(in, W, m, r, n, istride);
                         spawn_h2hc_recurse_threads(m, r, in, out, p,
                                              istride, ostride,
                                              nthreads);
                         break;
                  }

               break;
            }

       default:
          bug:
            fftw_die("BUG in rexecutor: invalid plan\n");
            break;
     }
}

static void rexecutor_simple_inplace_threads(int n, fftw_real *in,
                                   fftw_real *out,
                                   fftw_plan_node *p,
                                   int istride,
                                   int nthreads)
{
     switch (p->type) {
       case FFTW_REAL2HC:
            HACK_ALIGN_STACK_ODD;
            (p->nodeu.real2hc.codelet) (in, in, in + n * istride,
                                istride, istride, -istride);
            break;

       case FFTW_HC2REAL:
            HACK_ALIGN_STACK_ODD;
            (p->nodeu.hc2real.codelet) (in, in + n * istride, in,
                                istride, -istride, istride);
            break;

       default:
            {
               fftw_real *tmp;

               if (out)
                  tmp = out;
               else
                  tmp = (fftw_real *) fftw_malloc(n * sizeof(fftw_real));

               rexec_simple_threads(n, in, tmp, p, istride, 1, nthreads);
               rfftw_strided_copy(n, tmp, istride, in);

               if (!out)
                  fftw_free(tmp);
            }
     }
}

typedef struct {
     union {
          fftw_real2hc_codelet *r2c_codelet;
          fftw_hc2real_codelet *c2r_codelet;
        fftw_plan_node *p;
     } u;
     int n;
     fftw_real *in;
     fftw_real *out;
     int idist, odist, istride, ostride;
} rexec_many_data;

static void *rexec_many_r2c_codelet_thread(fftw_loop_data *ldata)
{
     int min = ldata->min, max = ldata->max;
     rexec_many_data *d = (rexec_many_data *) ldata->data;
     fftw_real2hc_codelet *r2c_codelet = d->u.r2c_codelet;
     int n = d->n;
     fftw_real *in = d->in;
     fftw_real *out = d->out;
     int idist = d->idist, odist = d->odist;
     int istride = d->istride, ostride = d->ostride;

     HACK_ALIGN_STACK_ODD;
     for (; min < max; ++min)
          r2c_codelet(in + min * idist,
                  out + min * odist,
                  out + n * ostride + min * odist,
                  istride, ostride, -ostride);
     return 0;
}

static void *rexec_many_c2r_codelet_thread(fftw_loop_data *ldata)
{
     int min = ldata->min, max = ldata->max;
     rexec_many_data *d = (rexec_many_data *) ldata->data;
     fftw_hc2real_codelet *c2r_codelet = d->u.c2r_codelet;
     int n = d->n;
     fftw_real *in = d->in;
     fftw_real *out = d->out;
     int idist = d->idist, odist = d->odist;
     int istride = d->istride, ostride = d->ostride;

     HACK_ALIGN_STACK_ODD;
     for (; min < max; ++min)
          c2r_codelet(in + min * idist,
                  in + n * istride + min * idist,
                  out + min * odist,
                  istride, -istride, ostride);
     return 0;
}

static void *rexec_many_simple_thread(fftw_loop_data *ldata)
{
     int min = ldata->min, max = ldata->max;
     rexec_many_data *d = (rexec_many_data *) ldata->data;
     fftw_plan_node *p = d->u.p;
     int n = d->n;
     fftw_real *in = d->in;
     fftw_real *out = d->out;
     int idist = d->idist, odist = d->odist;
     int istride = d->istride, ostride = d->ostride;

     for (; min < max; ++min)
          rfftw_executor_simple(n, in + min * idist,
                        out + min * odist,
                        p, istride, ostride,
                        FFTW_NORMAL_RECURSE);

     return 0;
}

static void rexecutor_many_threads(int n, fftw_real *in,
                           fftw_real *out,
                           fftw_plan_node *p,
                           int istride,
                           int ostride,
                           int howmany, int idist, int odist,
                           int nthreads)
{
     if (nthreads > howmany)
        nthreads = howmany;

     switch (p->type) {
       case FFTW_REAL2HC:
       {
            int s;
            fftw_real2hc_codelet *codelet = p->nodeu.real2hc.codelet;
            
            if (nthreads <= 1) {
               HACK_ALIGN_STACK_ODD;
               for (s = 0; s < howmany; ++s)
                  codelet(in + s * idist, out + s * odist,
                        out + n * ostride + s * odist,
                        istride, ostride, -ostride);
            }
            else {
               rexec_many_data d;
               
               d.n = n;
               d.in = in;
               d.out = out;
               d.u.r2c_codelet = codelet;
               d.istride = istride;
               d.ostride = ostride;
               d.idist = idist;
               d.odist = odist;
               fftw_thread_spawn_loop(howmany, nthreads,
                                rexec_many_r2c_codelet_thread, &d);
            }
            
            break;
       }
       
       case FFTW_HC2REAL:
       {
            int s;
            fftw_hc2real_codelet *codelet = p->nodeu.hc2real.codelet;
            
            if (nthreads <= 1) {
               HACK_ALIGN_STACK_ODD;
               for (s = 0; s < howmany; ++s)
                  codelet(in + s * idist,
                        in + n * istride + s * idist,
                        out + s * odist,
                        istride, -istride, ostride);
            }
            else {
               rexec_many_data d;

               d.n = n;
               d.in = in;
               d.out = out;
               d.u.c2r_codelet = codelet;
               d.istride = istride;
               d.ostride = ostride;
               d.idist = idist;
               d.odist = odist;
               fftw_thread_spawn_loop(howmany, nthreads,
                                rexec_many_c2r_codelet_thread, &d);
            }
            
            break;
       }
       
       default:
       {
            int s;               
            
            if (nthreads <= 1)
               for (s = 0; s < howmany; ++s) {
                  rfftw_executor_simple(n, in + s * idist,
                                    out + s * odist,
                                    p, istride, ostride,
                                    FFTW_NORMAL_RECURSE);
               }
            else {
               rexec_many_data d;

               d.in = in; d.out = out;
               d.n = n;
               d.u.p = p;
               d.istride = istride;
               d.ostride = ostride;
               d.idist = idist;
               d.odist = odist;
               fftw_thread_spawn_loop(howmany, nthreads,
                                rexec_many_simple_thread, &d);
            }
       }
     }
}

static void *rexec_many_simple_inplace_thread(fftw_loop_data *ldata)
{
     int min = ldata->min, max = ldata->max;
     rexec_many_data *d = (rexec_many_data *) ldata->data;
     fftw_plan_node *p = d->u.p;
     int n = d->n;
     fftw_real *in = d->in;
     fftw_real *out = d->out + n * ldata->thread_num;
     int idist = d->idist;
     int istride = d->istride;

     for (; min < max; ++min) {
          rfftw_executor_simple(n, in + min * idist, out, p, istride, 1,
                        FFTW_NORMAL_RECURSE);
        rfftw_strided_copy(n, out, istride, in + min * idist);
     }

     return 0;
}

static void rexecutor_many_inplace_threads(int n, fftw_real *in,
                                 fftw_real *out,
                                 fftw_plan_node *p,
                                 int istride,
                                 int howmany, int idist,
                                 int nthreads)
{
     switch (p->type) {
       case FFTW_REAL2HC:
       {
            int s;
            fftw_real2hc_codelet *codelet = p->nodeu.real2hc.codelet;
            
            if (nthreads <= 1) {
               HACK_ALIGN_STACK_ODD;
               for (s = 0; s < howmany; ++s)
                  codelet(in + s * idist, in + s * idist,
                        in + n * istride + s * idist,
                        istride, istride, -istride);
            }
            else {
               rexec_many_data d;
               
               d.n = n;
               d.in = in;
               d.out = in;
               d.u.r2c_codelet = codelet;
               d.istride = istride;
               d.ostride = istride;
               d.idist = idist;
               d.odist = idist;
               fftw_thread_spawn_loop(howmany, nthreads,
                                rexec_many_r2c_codelet_thread, &d);
            }
            
            break;
       }
       
       case FFTW_HC2REAL:
       {
            int s;
            fftw_hc2real_codelet *codelet = p->nodeu.hc2real.codelet;
            
            if (nthreads <= 1) {
               HACK_ALIGN_STACK_ODD;
               for (s = 0; s < howmany; ++s)
                  codelet(in + s * idist,
                        in + n * istride + s * idist,
                        in + s * idist,
                        istride, -istride, istride);
            }
            else {
               rexec_many_data d;
               
               d.n = n;
               d.in = in;
               d.out = in;
               d.u.c2r_codelet = codelet;
               d.istride = istride;
               d.ostride = istride;
               d.idist = idist;
               d.odist = idist;
               fftw_thread_spawn_loop(howmany, nthreads,
                                rexec_many_c2r_codelet_thread, &d);
            }
            
            break;
       }
       
       default:
       {
            int s;
            fftw_real *tmp;
            
            if (nthreads > howmany)
               nthreads = howmany;
            
            if (nthreads <= 1) {
               if (out)
                  tmp = out;
               else
                  tmp =(fftw_real *) fftw_malloc(n * 
                                           sizeof(fftw_real));
               
               for (s = 0; s < howmany; ++s) {
                  rfftw_executor_simple(n,
                                    in + s * idist,
                                    tmp,
                                    p, istride, 1,
                                    FFTW_NORMAL_RECURSE);
                  rfftw_strided_copy(n, tmp, istride,
                                 in + s * idist);
               }
               
               if (!out)
                  fftw_free(tmp);
            }
            else {
               rexec_many_data d;
               
               tmp = (fftw_real *)
                  fftw_malloc(nthreads * n * sizeof(fftw_real));

               d.in = in; 
               d.out = tmp;
               d.n = n;
               d.u.p = p;
               d.istride = istride;
               d.ostride = 1;
               d.idist = idist;
               d.odist = 0;
               fftw_thread_spawn_loop(howmany, nthreads,
                                rexec_many_simple_inplace_thread,&d);
               
               fftw_free(tmp);
            }
       }
     }
}

/* user interface */
void rfftw_threads(int nthreads,
               fftw_plan plan, int howmany, fftw_real *in, int istride,
               int idist, fftw_real *out, int ostride, int odist)
{
     int n = plan->n;

     if (plan->flags & FFTW_IN_PLACE) {
        if (howmany == 1) {
             rexecutor_simple_inplace_threads(n, in, out, plan->root, 
                                    istride, nthreads);
        } else {
             rexecutor_many_inplace_threads(n, in, out, plan->root,
                                    istride, howmany, idist,
                                    nthreads);
        }
     } else {
        if (howmany == 1) {
             rexec_simple_threads(n, in, out, plan->root, istride, ostride,
                            nthreads);
        } else {
             rexecutor_many_threads(n, in, out, plan->root, istride, ostride,
                              howmany, idist, odist, nthreads);
        }
     }
}

void rfftw_threads_one(int nthreads,
                   fftw_plan plan, fftw_real *in, fftw_real *out)
{
     int n = plan->n;

     if (plan->flags & FFTW_IN_PLACE)
        rexecutor_simple_inplace_threads(n, in, out, plan->root, 1,
                                 nthreads);
     else
        rexec_simple_threads(n, in, out, plan->root, 1, 1, nthreads);
}

Generated by  Doxygen 1.6.0   Back to index