preconditioner_array.cc
1// LIC// ====================================================================
2// LIC// This file forms part of oomph-lib, the object-oriented,
3// LIC// multi-physics finite-element library, available
4// LIC// at http://www.oomph-lib.org.
5// LIC//
6// LIC// Copyright (C) 2006-2025 Matthias Heil and Andrew Hazel
7// LIC//
8// LIC// This library is free software; you can redistribute it and/or
9// LIC// modify it under the terms of the GNU Lesser General Public
10// LIC// License as published by the Free Software Foundation; either
11// LIC// version 2.1 of the License, or (at your option) any later version.
12// LIC//
13// LIC// This library is distributed in the hope that it will be useful,
14// LIC// but WITHOUT ANY WARRANTY; without even the implied warranty of
15// LIC// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16// LIC// Lesser General Public License for more details.
17// LIC//
18// LIC// You should have received a copy of the GNU Lesser General Public
19// LIC// License along with this library; if not, write to the Free Software
20// LIC// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21// LIC// 02110-1301 USA.
22// LIC//
23// LIC// The authors may be contacted at oomph-lib@maths.man.ac.uk.
24// LIC//
25// LIC//====================================================================
26
27// Config header
28#ifdef HAVE_CONFIG_H
29#include <oomph-lib-config.h>
30#endif
31
32// The preconditioner array is only useful if we have MPI; otherwise a dummy
33// implementation is used and this file doesn't need to implement anything
34// (see the header file).
35#ifdef OOMPH_HAS_MPI
36
37// oomph-lib includes
38#include "preconditioner_array.h"
39
40namespace oomph
41{
42 //============================================================================
43 /// Setup the preconditioners. Sets up each preconditioner in the
44 /// array for the corresponding matrix in the vector matrix_pt.
45 /// The number of preconditioners in the array is taken to be the length of
46 /// prec_pt.
47 //============================================================================
48 void PreconditionerArray::setup_preconditioners(
49 Vector<CRDoubleMatrix*> matrix_pt,
50 Vector<Preconditioner*> prec_pt,
51 const OomphCommunicator* comm_pt)
52 {
53 // clean memory
54 this->clean_up_memory();
55
56 // get the number of preconditioners in the array
57 Nprec = prec_pt.size();
58
59#ifdef PARANOID
60 // check that the preconditioners have been set
61 if (Nprec < 2)
62 {
63 std::ostringstream error_message;
64 error_message << "The PreconditionerArray requires at least 2 "
65 << "preconditioners";
66 throw OomphLibError(
67 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
68 }
69 // first check that the vector matrix_pt is the correct length
70 if (matrix_pt.size() != Nprec)
71 {
72 std::ostringstream error_message;
73 error_message << "The same number of preconditioners and matrices must "
74 << "be passed to the setup_preconditioners(...).";
75 throw OomphLibError(
76 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
77 }
78
79 // Resize the storage of the PARANOID check distributions
80 // Already cleared by clean_up_memory call at top of function
81 Distribution_pt.resize(Nprec);
82#endif
83
84 // for each matrix: PARANOID checks, and store a copy of the global communicator
85 for (unsigned i = 0; i < Nprec; i++)
86 {
87#ifdef PARANOID
88 // paranoid check that each matrix pointer is non-null and that
89 // the matrix has been built
90 if (matrix_pt[i] == 0)
91 {
92 std::ostringstream error_message;
93 error_message << "matrix_pt[" << i << "] = NULL.";
94 throw OomphLibError(error_message.str(),
95 OOMPH_CURRENT_FUNCTION,
96 OOMPH_EXCEPTION_LOCATION);
97 }
98
99 // check the matrix has been built
100 if (!matrix_pt[i]->built())
101 {
102 std::ostringstream error_message;
103 error_message << "Matrix " << i << " has not been built.";
104 throw OomphLibError(error_message.str(),
105 OOMPH_CURRENT_FUNCTION,
106 OOMPH_EXCEPTION_LOCATION);
107 }
108#endif
109
110 // check that all the matrices have the same communicator
111 // and store a copy of the communicator
112 if (i == 0)
113 {
114 Global_communicator_pt = new OomphCommunicator(
115 matrix_pt[i]->distribution_pt()->communicator_pt());
116 }
117
118#ifdef PARANOID
119 else
120 {
121 if (*Global_communicator_pt !=
122 *matrix_pt[i]->distribution_pt()->communicator_pt())
123 {
124 std::ostringstream error_message;
125 error_message << "All matrices must have the same communicator.";
126 throw OomphLibError(error_message.str(),
127 OOMPH_CURRENT_FUNCTION,
128 OOMPH_EXCEPTION_LOCATION);
129 }
130 }
131
132 // store a copy of the Distribution of each preconditioner for future
133 // PARANOID checks
134 Distribution_pt[i] =
135 new LinearAlgebraDistribution(matrix_pt[i]->distribution_pt());
136#endif
137 }
138
139 // number of processors
140 unsigned nproc = Global_communicator_pt->nproc();
141
142 // next compute the distribution of the preconditioner over the processors
143 // such that each preconditioner has an (as near as possible) equal number of
144 // processors
145 First_proc_for_prec.resize(Nprec);
146 Nproc_for_prec.resize(Nprec);
147
148 // compute the first processor for each preconditioner
149 for (unsigned p = 0; p < Nprec; p++)
150 {
151 First_proc_for_prec[p] = unsigned(double(p * nproc) / double(Nprec));
152 }
153
154 // compute the number of processors for each preconditioner
155 for (unsigned p = 0; p < Nprec - 1; p++)
156 {
157 Nproc_for_prec[p] = First_proc_for_prec[p + 1] - First_proc_for_prec[p];
158 }
159 Nproc_for_prec[Nprec - 1] = nproc - First_proc_for_prec[Nprec - 1];
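// (e.g. nproc=5 and Nprec=2 gives First_proc_for_prec=[0,2] and
// Nproc_for_prec=[2,3])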
160
161#ifdef PARANOID
162 // paranoid check that every preconditioner has more than one processor
163 for (unsigned p = 0; p < Nprec; p++)
164 {
165 if (Nproc_for_prec[p] == 0)
166 {
167 std::ostringstream error_message;
168 error_message << "We only have " << nproc << " processor[s]!\n"
169 << "This is not enough to perform the " << Nprec
170 << " block solves in parallel! Sorry! \n"
171 << "Please run this with more processors or disable the\n"
172 << "request for two-level paralellism.\n";
173 throw OomphLibError(error_message.str(),
174 OOMPH_CURRENT_FUNCTION,
175 OOMPH_EXCEPTION_LOCATION);
176 }
177 }
178#endif
179
180 // compute the color of this processor
181 Color = 0;
182 unsigned my_rank = Global_communicator_pt->my_rank();
183 while (!(First_proc_for_prec[Color] <= my_rank &&
184 my_rank < First_proc_for_prec[Color] + Nproc_for_prec[Color]))
185 {
186 Color++;
187 }
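// Color now identifies the preconditioner (and hence the group of
// processors) that this processor has been assigned to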
188
189 // create the local communicator for this preconditioner's group of processors
190 Local_communicator_pt = Global_communicator_pt->split(Color, my_rank);
191
192 // pointer for the local matrix on this processor
193 CRDoubleMatrix* local_matrix_pt = 0;
194
195 // resize storage for details of the data to be sent and received
196 First_row_for_proc.resize(Nprec);
197 Nrow_local_for_proc.resize(Nprec);
198 First_row_from_proc.resize(Nprec);
199 Nrow_local_from_proc.resize(Nprec);
200
201 // Vector of MPI_Requests - used for distributed matrices
202 Vector<MPI_Request> req;
203
204 // Counter for the number of requests used
205 unsigned c = 0;
206
207 // storage for the target distribution
208 Vector<Vector<unsigned>> target_first_row(Nprec);
209 Vector<Vector<unsigned>> target_nrow_local(Nprec);
210
211 // create storage for the nnz to be sent and received for each
212 // preconditioner
213 Vector<Vector<unsigned>> nnz_send(Nprec);
214 Vector<Vector<unsigned>> nnz_recv(Nprec);
215
216
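// The matrix of each preconditioner is now copied/redistributed onto the
// subset of processors assigned to it: target_first_row/target_nrow_local
// hold the target row distribution of each matrix, and nnz_send/nnz_recv
// the number of nonzeros this processor exchanges with every other
// processor. The methods below (selected by Method) differ only in how
// this MPI communication is organised.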
217 /////////////////////////////////////////////////////////////////////////////
218 /////////////////////////////////////////////////////////////////////////////
219 /////////////////////////////////////////////////////////////////////////////
220 /////////////////////////////////////////////////////////////////////////////
221
222
223 // METHOD 0
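// Method 0: exchange the nonzero counts with non-blocking sends and
// blocking receives, then ship values, column_index and row_start as
// three separate non-blocking sends per destination; the receiving
// processors assemble their local matrices in a second loop using
// blocking receives.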
224 if (Method == 0)
225 {
226 // for every distributed matrix work out the target distribution on its
227 // subset of processors, exchange the nonzero counts and post the data sends
228 for (unsigned i = 0; i < Nprec; i++)
229 {
230 // nothing needs to be communicated for a global (!distributed) matrix;
231 // for a distributed matrix work out what has to be sent where
232 if (matrix_pt[i]->distributed())
233 {
234 // first compute the distribution of this preconditioner on its subset
235 // of processors
236
237 // number of rows for this preconditioner
238 unsigned nrow = matrix_pt[i]->nrow();
239
240 // set up the target first_row and nrow_local for this preconditioner
241 target_first_row[i].resize(nproc);
242 target_nrow_local[i].resize(nproc);
243 unsigned nproc_local = Nproc_for_prec[i];
244 for (unsigned p = 0; p < nproc_local; p++)
245 {
246 int pp = First_proc_for_prec[i] + p;
247 target_first_row[i][pp] =
248 unsigned(double(p * nrow) / double(nproc_local));
249 }
250 for (unsigned p = 0; p < nproc_local - 1; p++)
251 {
252 int pp = First_proc_for_prec[i] + p;
253 target_nrow_local[i][pp] =
254 target_first_row[i][pp + 1] - target_first_row[i][pp];
255 }
256 unsigned last_local_proc = First_proc_for_prec[i] + nproc_local - 1;
257 target_nrow_local[i][last_local_proc] =
258 nrow - target_first_row[i][last_local_proc];
259
260 // get the details of the current distribution
261 Vector<unsigned> current_first_row(nproc);
262 Vector<unsigned> current_nrow_local(nproc);
263 for (unsigned p = 0; p < nproc; p++)
264 {
265 current_first_row[p] = matrix_pt[i]->first_row(p);
266 current_nrow_local[p] = matrix_pt[i]->nrow_local(p);
267 }
268
269 // resize storage for details of the data to be sent and received
270 First_row_for_proc[i].resize(nproc, 0);
271 Nrow_local_for_proc[i].resize(nproc, 0);
272 First_row_from_proc[i].resize(nproc, 0);
273 Nrow_local_from_proc[i].resize(nproc, 0);
274
275 // for every processor compute the first_row and nrow_local that will
276 // be sent and received by this processor
277 for (unsigned p = 0; p < nproc; p++)
278 {
279 // start with data to be sent
280 if ((target_first_row[i][p] <
281 (current_first_row[my_rank] + current_nrow_local[my_rank])) &&
282 (current_first_row[my_rank] <
283 (target_first_row[i][p] + target_nrow_local[i][p])))
284 {
285 First_row_for_proc[i][p] =
286 std::max(current_first_row[my_rank], target_first_row[i][p]);
287 Nrow_local_for_proc[i][p] =
288 std::min(
289 (current_first_row[my_rank] + current_nrow_local[my_rank]),
290 (target_first_row[i][p] + target_nrow_local[i][p])) -
291 First_row_for_proc[i][p];
292 }
293
294 // and data to be received
295 if ((target_first_row[i][my_rank] <
296 (current_first_row[p] + current_nrow_local[p])) &&
297 (current_first_row[p] < (target_first_row[i][my_rank] +
298 target_nrow_local[i][my_rank])))
299 {
300 First_row_from_proc[i][p] =
301 std::max(current_first_row[p], target_first_row[i][my_rank]);
302 Nrow_local_from_proc[i][p] =
303 std::min((current_first_row[p] + current_nrow_local[p]),
304 (target_first_row[i][my_rank] +
305 target_nrow_local[i][my_rank])) -
306 First_row_from_proc[i][p];
307 }
308 }
309
310 // resize nnz_send
311 nnz_send[i].resize(nproc);
312
313 // compute the number of nnzs to be sent
314 // and the number of send and receive requests to be made (nreq)
315 for (unsigned p = 0; p < nproc; p++)
316 {
317 if (Nrow_local_for_proc[i][p] != 0)
318 {
319 int* row_start = matrix_pt[i]->row_start();
320 unsigned k =
321 First_row_for_proc[i][p] - current_first_row[my_rank];
322 nnz_send[i][p] =
323 row_start[k + Nrow_local_for_proc[i][p]] - row_start[k];
324 }
325 }
326
327 // send nnz to be sent to each processor
328 for (unsigned p = 0; p < nproc; p++)
329 {
330 // don't mpi send to self
331 if (p != my_rank)
332 {
333 // non block send
334 if (Nrow_local_for_proc[i][p] != 0)
335 {
336 // send to other processors
337 int tag = this->compute_tag(nproc, my_rank, p, 0);
338 MPI_Request tr;
339 req.push_back(tr);
340 MPI_Isend(&nnz_send[i][p],
341 1,
342 MPI_UNSIGNED,
343 p,
344 tag,
345 Global_communicator_pt->mpi_comm(),
346 &req[c]);
347 c++;
348 }
349 }
350 }
351
352 // resize nnz_recv
353 nnz_recv[i].resize(nproc);
354
355 // receive nnz from other processors
356 for (unsigned pp = 0; pp < nproc; pp++)
357 {
358 // next processor to receive from
359 unsigned p = (nproc + my_rank - pp) % nproc;
360
361 // don't mpi receive from self
362 if (p != my_rank)
363 {
364 // blocking recv
365 if (Nrow_local_from_proc[i][p] != 0)
366 {
367 int tag = this->compute_tag(nproc, p, my_rank, 0);
368 MPI_Status stat;
369 unsigned nnz_temp;
370 MPI_Recv(&nnz_temp,
371 1,
372 MPI_UNSIGNED,
373 p,
374 tag,
375 Global_communicator_pt->mpi_comm(),
376 &stat);
377 nnz_recv[i][p] = nnz_temp;
378 }
379 }
380
381 // receive from self
382 else
383 {
384 nnz_recv[i][p] = nnz_send[i][p];
385 }
386 }
387
388 // get pointers to the underlying data in the current matrix
389 double* values_send = matrix_pt[i]->value();
390 int* row_start_send = matrix_pt[i]->row_start();
391 int* column_index_send = matrix_pt[i]->column_index();
392
393 // send and receive the contents of the vector
394 for (unsigned p = 0; p < nproc; p++)
395 {
396 // use mpi methods to send to and receive from all but my rank
397 if (p != my_rank)
398 {
399 // send
400 if (nnz_send[i][p] != 0)
401 {
402 // compute the offset for row_start
403 int offset_n =
404 First_row_for_proc[i][p] - current_first_row[my_rank];
405
406 // compute the offset for the values and column_index
407 int offset_nnz = row_start_send[offset_n];
408
409 // values
410 int tag = this->compute_tag(nproc, my_rank, p, 1);
411 MPI_Request tr1;
412 req.push_back(tr1);
413 MPI_Isend(values_send + offset_nnz,
414 int(nnz_send[i][p]),
415 MPI_DOUBLE,
416 p,
417 tag,
418 Global_communicator_pt->mpi_comm(),
419 &req[c]);
420 c++;
421
422 // column_index
423 tag = this->compute_tag(nproc, my_rank, p, 2);
424 MPI_Request tr2;
425 req.push_back(tr2);
426 MPI_Isend(column_index_send + offset_nnz,
427 int(nnz_send[i][p]),
428 MPI_INT,
429 p,
430 tag,
431 Global_communicator_pt->mpi_comm(),
432 &req[c]);
433 c++;
434
435 // row_start
436 tag = this->compute_tag(nproc, my_rank, p, 3);
437 MPI_Request tr3;
438 req.push_back(tr3);
439 MPI_Isend(row_start_send + offset_n,
440 int(Nrow_local_for_proc[i][p]),
441 MPI_INT,
442 p,
443 tag,
444 Global_communicator_pt->mpi_comm(),
445 &req[c]);
446 c++;
447 }
448 }
449 }
450 }
451 }
452
453 // for every matrix we assemble the duplicate of the matrix on fewer
454 // processors and setup the preconditioner
455 for (unsigned i = 0; i < Nprec; i++)
456 {
457 // if the matrix is global (!distributed) then just construct a copy
458 // on the subset of processors
459 if (!matrix_pt[i]->distributed())
460 {
461 oomph_info << "matrix not distributed" << std::endl;
462 // if this matrix is to be preconditioned by this processor
463 if (i == Color)
464 {
465 // create the local distribution for this matrix
466 LinearAlgebraDistribution* temp_dist_pt =
467 new LinearAlgebraDistribution(
468 Local_communicator_pt, matrix_pt[i]->nrow(), false);
469
470 // create the corresponding matrix
471 local_matrix_pt = new CRDoubleMatrix(temp_dist_pt);
472 delete temp_dist_pt; // (dist has now been copied)
473
474 // get pointers to the underlying data
475 double* values_pt = matrix_pt[i]->value();
476 int* column_index_pt = matrix_pt[i]->column_index();
477 int* row_start_pt = matrix_pt[i]->row_start();
478
479 // build the matrix without a copy of the data
480 local_matrix_pt->build_without_copy(matrix_pt[i]->ncol(),
481 matrix_pt[i]->nnz(),
482 values_pt,
483 column_index_pt,
484 row_start_pt);
485 }
486 }
487
488 // else we assemble a copy of the matrix distributed over a subset of
489 // processors
490 else
491 {
492 // number of rows for this preconditioner
493
494 // if we are assembling the matrix on this processor
495 if (i == Color)
496 {
497 // create the local distribution for this matrix
498 LinearAlgebraDistribution* temp_dist_pt =
499 new LinearAlgebraDistribution(Local_communicator_pt,
500 target_first_row[i][my_rank],
501 target_nrow_local[i][my_rank]);
502
503 // create the corresponding matrix
504 local_matrix_pt = new CRDoubleMatrix(temp_dist_pt);
505 delete temp_dist_pt; // (dist has now been copied)
506
507 // get the number of nnzs to be received from each processor
508
509 // total number of nnz to be received
510 unsigned nnz_total = 0;
511 for (unsigned p = 0; p < nproc; p++)
512 {
513 nnz_total += nnz_recv[i][p];
514 }
515
516 // compute nnz block start
517 Vector<unsigned> nnz_start_proc;
518 Vector<unsigned> nnz_start_index;
519 unsigned row_ptr = target_first_row[i][my_rank];
520 int p = 0;
521 unsigned nnz_ptr = 0;
522 for (p = 0; p < int(nproc); p++)
523 {
524 if (First_row_from_proc[i][p] == row_ptr &&
525 Nrow_local_from_proc[i][p] != 0 && nnz_ptr != nnz_total)
526 {
527 nnz_start_proc.push_back(p);
528 nnz_start_index.push_back(nnz_ptr);
529 nnz_ptr += nnz_recv[i][p];
530 row_ptr += Nrow_local_from_proc[i][p];
531 p = -1;
532 }
533 }
534
535 // storage for received data
536 double* values_recv = new double[nnz_total];
537 int* column_index_recv = new int[nnz_total];
538 int* row_start_recv = new int[target_nrow_local[i][my_rank] + 1];
539
540 // send and receive the contents of the vector
541 for (unsigned pp = 0; pp < nproc; pp++)
542 {
543 // next processor to receive from
544 unsigned p = (nproc + my_rank - pp) % nproc;
545
546 // use mpi methods to send to and receive from all but my rank
547 if (p != my_rank)
548 {
549 // just receive
550 if (nnz_recv[i][p] != 0)
551 {
552 // compute the offset for row_start
553 int offset_n =
554 First_row_from_proc[i][p] - target_first_row[i][my_rank];
555
556 // compute the offset for the values and column_index
557 unsigned k = 0;
558 while (nnz_start_proc[k] != p)
559 {
560 k++;
561 }
562 int offset_nnz = nnz_start_index[k];
563
564 // values
565 int tag = this->compute_tag(nproc, p, my_rank, 1);
566 MPI_Status stat1;
567 MPI_Recv(values_recv + offset_nnz,
568 int(nnz_recv[i][p]),
569 MPI_DOUBLE,
570 p,
571 tag,
572 Global_communicator_pt->mpi_comm(),
573 &stat1);
574
575 // column_index
576 tag = this->compute_tag(nproc, p, my_rank, 2);
577 MPI_Status stat2;
578 MPI_Recv(column_index_recv + offset_nnz,
579 int(nnz_recv[i][p]),
580 MPI_INT,
581 p,
582 tag,
583 Global_communicator_pt->mpi_comm(),
584 &stat2);
585
586 // row_start
587 tag = this->compute_tag(nproc, p, my_rank, 3);
588 MPI_Status stat3;
589 MPI_Recv(row_start_recv + offset_n,
590 int(Nrow_local_from_proc[i][p]),
591 MPI_INT,
592 p,
593 tag,
594 Global_communicator_pt->mpi_comm(),
595 &stat3);
596 }
597 }
598 // otherwise just copy from self
599 else
600 {
601 if (nnz_recv[i][p] != 0)
602 {
603 // get pointers to the underlying data in the current matrix
604 double* values_send = matrix_pt[i]->value();
605 int* row_start_send = matrix_pt[i]->row_start();
606 int* column_index_send = matrix_pt[i]->column_index();
607
608 // offset for row_start send to self
609 unsigned offset_n_send =
610 First_row_for_proc[i][my_rank] - matrix_pt[i]->first_row(p);
611 // offset for values and column_index send to self
612 unsigned offset_nnz_send = row_start_send[offset_n_send];
613
614 // offset for row_start receive from self
615 unsigned offset_n_recv = First_row_from_proc[i][my_rank] -
616 target_first_row[i][my_rank];
617
618 // offset for values and column_index receive from self
619 unsigned k = 0;
620 while (nnz_start_proc[k] != p)
621 {
622 k++;
623 }
624 unsigned offset_nnz_recv = nnz_start_index[k];
625
626 // and copy
627
628 // values and column_index
629 unsigned n_nnz = nnz_send[i][my_rank];
630 for (unsigned j = 0; j < n_nnz; j++)
631 {
632 values_recv[offset_nnz_recv + j] =
633 values_send[offset_nnz_send + j];
634 column_index_recv[offset_nnz_recv + j] =
635 column_index_send[offset_nnz_send + j];
636 }
637
638 // row start
639 unsigned n_n = Nrow_local_from_proc[i][my_rank];
640 for (unsigned j = 0; j < n_n; j++)
641 {
642 row_start_recv[offset_n_recv + j] =
643 row_start_send[offset_n_send + j];
644 }
645 }
646 }
647 }
648
649
650 // number of processors contributing to the local matrix on this
651 // processor
652
653 // update the row start
654 unsigned nproc_contrib = nnz_start_index.size();
655 for (unsigned j = 0; j < nproc_contrib; j++)
656 {
657 unsigned first = First_row_from_proc[i][nnz_start_proc[j]] -
658 target_first_row[i][my_rank];
659 unsigned last =
660 first + Nrow_local_from_proc[i][nnz_start_proc[j]];
661 unsigned nnz_inc = nnz_start_index[j] - row_start_recv[first];
662 for (unsigned k = first; k < last; k++)
663 {
664 row_start_recv[k] += nnz_inc;
665 }
666 }
667 row_start_recv[target_nrow_local[i][my_rank]] = int(nnz_total);
668
669 // build the matrix without a copy of the data
670 local_matrix_pt->build_without_copy(matrix_pt[i]->ncol(),
671 nnz_total,
672 values_recv,
673 column_index_recv,
674 row_start_recv);
675 }
676 }
677 }
678
679 // wait for all sends to complete
680 if (c != 0)
681 {
682 Vector<MPI_Status> stat(c);
683 MPI_Waitall(c, &req[0], &stat[0]);
684 }
685 }
686
687
688 /////////////////////////////////////////////////////////////////////////////
689 /////////////////////////////////////////////////////////////////////////////
690 /////////////////////////////////////////////////////////////////////////////
691 /////////////////////////////////////////////////////////////////////////////
692
693
694 // METHOD 1
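// Method 1: exchange the nonzero counts with non-blocking sends and
// receives, then pack values, column_index and row_start for each
// destination into a single derived MPI datatype (MPI_Type_create_struct)
// so that one message per destination suffices; the sends are posted
// before the matching receives.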
695 else if (Method == 1)
696 {
697 // temporary storage for nnz recv
698 unsigned* nnz_recv_temp = new unsigned[nproc * Nprec];
699 for (unsigned j = 0; j < nproc * Nprec; j++)
700 {
701 nnz_recv_temp[j] = 0;
702 }
703
704 // for every matrix we assemble the duplicate of the matrix on fewer
705 // processors and setup the preconditioner
706 for (unsigned i = 0; i < Nprec; i++)
707 {
708 // if the matrix is global (!distributed) then just construct a copy
709 // on the subset of processors
710 if (!matrix_pt[i]->distributed())
711 {
712 // if this matrix is to be preconditioned by this processor
713 if (i == Color)
714 {
715 // create the local distribution for this matrix
716 LinearAlgebraDistribution* temp_dist_pt =
717 new LinearAlgebraDistribution(
718 Local_communicator_pt, matrix_pt[i]->nrow(), false);
719
720 // create the corresponding matrix
721 local_matrix_pt = new CRDoubleMatrix(temp_dist_pt);
722 delete temp_dist_pt; // (dist has now been copied)
723
724 // get pointers to the underlying data
725 double* values_pt = matrix_pt[i]->value();
726 int* column_index_pt = matrix_pt[i]->column_index();
727 int* row_start_pt = matrix_pt[i]->row_start();
728
729 // build the matrix without a copy of the data
730 local_matrix_pt->build_without_copy(matrix_pt[i]->ncol(),
731 matrix_pt[i]->nnz(),
732 values_pt,
733 column_index_pt,
734 row_start_pt);
735 }
736 }
737
738 // otherwise the matrix is distributed and has to be redistributed over
739 // this preconditioner's subset of processors
740 else
741 {
742 // first compute the distribution of this preconditioner on its subset
743 // of processors
744
745 // number of rows for this preconditioner
746 unsigned nrow = matrix_pt[i]->nrow();
747
748 // set up the target first_row and nrow_local for this preconditioner
749 target_first_row[i].resize(nproc);
750 target_nrow_local[i].resize(nproc);
751 unsigned nproc_local = Nproc_for_prec[i];
752 for (unsigned p = 0; p < nproc_local; p++)
753 {
754 int pp = First_proc_for_prec[i] + p;
755 target_first_row[i][pp] =
756 unsigned(double(p * nrow) / double(nproc_local));
757 }
758 for (unsigned p = 0; p < nproc_local - 1; p++)
759 {
760 int pp = First_proc_for_prec[i] + p;
761 target_nrow_local[i][pp] =
762 target_first_row[i][pp + 1] - target_first_row[i][pp];
763 }
764 unsigned last_local_proc = First_proc_for_prec[i] + nproc_local - 1;
765 target_nrow_local[i][last_local_proc] =
766 nrow - target_first_row[i][last_local_proc];
767
768 // get the details of the current distribution
769 Vector<unsigned> current_first_row(nproc);
770 Vector<unsigned> current_nrow_local(nproc);
771 for (unsigned p = 0; p < nproc; p++)
772 {
773 current_first_row[p] = matrix_pt[i]->first_row(p);
774 current_nrow_local[p] = matrix_pt[i]->nrow_local(p);
775 }
776
777 // resize storage for details of the data to be sent and received
778 First_row_for_proc[i].resize(nproc, 0);
779 Nrow_local_for_proc[i].resize(nproc, 0);
780 First_row_from_proc[i].resize(nproc, 0);
781 Nrow_local_from_proc[i].resize(nproc, 0);
782
783 // for every processor compute the first_row and nrow_local that will
784 // be sent and received by this processor
785 for (unsigned p = 0; p < nproc; p++)
786 {
787 // start with data to be sent
788 if ((target_first_row[i][p] <
789 (current_first_row[my_rank] + current_nrow_local[my_rank])) &&
790 (current_first_row[my_rank] <
791 (target_first_row[i][p] + target_nrow_local[i][p])))
792 {
793 First_row_for_proc[i][p] =
794 std::max(current_first_row[my_rank], target_first_row[i][p]);
795 Nrow_local_for_proc[i][p] =
796 std::min(
797 (current_first_row[my_rank] + current_nrow_local[my_rank]),
798 (target_first_row[i][p] + target_nrow_local[i][p])) -
799 First_row_for_proc[i][p];
800 }
801
802 // and data to be received
803 if ((target_first_row[i][my_rank] <
804 (current_first_row[p] + current_nrow_local[p])) &&
805 (current_first_row[p] < (target_first_row[i][my_rank] +
806 target_nrow_local[i][my_rank])))
807 {
808 First_row_from_proc[i][p] =
809 std::max(current_first_row[p], target_first_row[i][my_rank]);
810 Nrow_local_from_proc[i][p] =
811 std::min((current_first_row[p] + current_nrow_local[p]),
812 (target_first_row[i][my_rank] +
813 target_nrow_local[i][my_rank])) -
814 First_row_from_proc[i][p];
815 }
816 }
817
818 // resize nnz_send
819 nnz_send[i].resize(nproc);
820
821 // compute the number of nnzs to be sent
822 // and the number of send and receive requests to be made (nreq)
823 for (unsigned p = 0; p < nproc; p++)
824 {
825 if (Nrow_local_for_proc[i][p] != 0)
826 {
827 int* row_start = matrix_pt[i]->row_start();
828 unsigned k =
829 First_row_for_proc[i][p] - current_first_row[my_rank];
830 nnz_send[i][p] =
831 row_start[k + Nrow_local_for_proc[i][p]] - row_start[k];
832 }
833 }
834
835 // resize nnz_recv
836 nnz_recv[i].resize(nproc);
837
838 // send nnz to be sent to each processor
839 for (unsigned p = 0; p < nproc; p++)
840 {
841 // send and recv
842
843 // don't mpi send to self
844 if (p != my_rank)
845 {
846 // non block send
847 if (Nrow_local_for_proc[i][p] != 0)
848 {
849 // send to other processors
850 int tag = this->compute_tag(nproc, my_rank, p, 0);
851 MPI_Request tr;
852 req.push_back(tr);
853 MPI_Isend(&nnz_send[i][p],
854 1,
855 MPI_UNSIGNED,
856 p,
857 tag,
858 Global_communicator_pt->mpi_comm(),
859 &req[c]);
860 c++;
861 }
862
863 // non blocking recv
864 if (Nrow_local_from_proc[i][p] != 0)
865 {
866 int tag = this->compute_tag(nproc, p, my_rank, 0);
867 MPI_Request tr;
868 req.push_back(tr);
869 MPI_Irecv(nnz_recv_temp + (i * nproc) + p,
870 1,
871 MPI_UNSIGNED,
872 p,
873 tag,
874 Global_communicator_pt->mpi_comm(),
875 &req[c]);
876 c++;
877 }
878 }
879 // receive from self
880 else
881 {
882 if (Nrow_local_for_proc[i][p] != 0)
883 {
884 nnz_recv_temp[(i * nproc) + p] = nnz_send[i][p];
885 }
886 }
887 }
888 }
889 }
890 if (c != 0)
891 {
892 Vector<MPI_Status> stat(c);
893 MPI_Waitall(c, &req[0], &stat[0]);
894 req.clear();
895 stat.clear();
896 }
897 c = 0;
898 for (unsigned i = 0; i < Nprec; i++)
899 {
900 for (unsigned p = 0; p < nproc; p++)
901 {
902 nnz_recv[i][p] = nnz_recv_temp[(i * nproc) + p];
903 }
904 }
905 delete[] nnz_recv_temp;
906
907 // get the number of nnzs to be received from each processor
908
909 // total number of nnz to be received
910 unsigned nnz_total = 0;
911 for (unsigned p = 0; p < nproc; p++)
912 {
913 nnz_total += nnz_recv[Color][p];
914 }
915
916 // compute nnz block start
917 Vector<unsigned> nnz_start_proc;
918 Vector<unsigned> nnz_start_index;
919 unsigned row_ptr = target_first_row[Color][my_rank];
920 int p = 0;
921 unsigned nnz_ptr = 0;
922 for (p = 0; p < int(nproc); p++)
923 {
924 if (First_row_from_proc[Color][p] == row_ptr &&
925 Nrow_local_from_proc[Color][p] != 0 && nnz_ptr != nnz_total)
926 {
927 nnz_start_proc.push_back(p);
928 nnz_start_index.push_back(nnz_ptr);
929 nnz_ptr += nnz_recv[Color][p];
930 row_ptr += Nrow_local_from_proc[Color][p];
931 p = -1;
932 }
933 }
934
935 // storage for derived datatypes
936 Vector<MPI_Datatype> datatypes;
937
938 // storage for received data
939 double* values_recv = new double[nnz_total];
940 int* column_index_recv = new int[nnz_total];
941 int* row_start_recv = new int[target_nrow_local[Color][my_rank] + 1];
942
943 ///////////////////////////////////////////////////////////////////////////
944 // SEND
945 ///////////////////////////////////////////////////////////////////////////
946 unsigned c_send = 0;
947 Vector<MPI_Request> send_req;
948
949 // for every matrix we assemble the duplicate of the matrix on fewer
950 // processors and setup the preconditioner
951 for (unsigned i = 0; i < Nprec; i++)
952 {
953 // get pointers to the underlying data in the current matrix
954 double* values_send = matrix_pt[i]->value();
955 int* row_start_send = matrix_pt[i]->row_start();
956 int* column_index_send = matrix_pt[i]->column_index();
957
958 // send and receive the contents of the vector
959 for (unsigned p = 0; p < nproc; p++)
960 {
961 // use mpi methods to send to and receive from all but my rank
962 if (p != my_rank)
963 {
964 // send
965 if (nnz_send[i][p] != 0)
966 {
967 // create 3 MPI contiguous datatypes
968 // + values
969 // + column_index
970 // + row_start
971
972 // values
973 MPI_Datatype datatype_values;
974 MPI_Type_contiguous(
975 int(nnz_send[i][p]), MPI_DOUBLE, &datatype_values);
976 MPI_Type_commit(&datatype_values);
977 datatypes.push_back(datatype_values);
978
979 // column index
980 MPI_Datatype datatype_column_index;
981 MPI_Type_contiguous(
982 int(nnz_send[i][p]), MPI_INT, &datatype_column_index);
983 MPI_Type_commit(&datatype_column_index);
984 datatypes.push_back(datatype_column_index);
985
986 // row start
987 MPI_Datatype datatype_row_start;
988 MPI_Type_contiguous(
989 int(Nrow_local_for_proc[i][p]), MPI_INT, &datatype_row_start);
990 MPI_Type_commit(&datatype_row_start);
991 datatypes.push_back(datatype_row_start);
992
993 // assemble the typelist
994 MPI_Datatype typelist[3];
995 typelist[0] = datatype_values;
996 typelist[1] = datatype_column_index;
997 typelist[2] = datatype_row_start;
998
999 // compute the offset for row_start
1000 int offset_n =
1001 First_row_for_proc[i][p] - matrix_pt[i]->first_row(my_rank);
1002
1003 // compute the offset for the values and column_index
1004 int offset_nnz = row_start_send[offset_n];
1005
1006 // next compute the displacements
1007 MPI_Aint displacements[3];
1008 MPI_Get_address(values_send + offset_nnz, &displacements[0]);
1009 MPI_Get_address(column_index_send + offset_nnz,
1010 &displacements[1]);
1011 MPI_Get_address(row_start_send + offset_n, &displacements[2]);
1012 for (int j = 2; j >= 0; j--)
1013 {
1014 displacements[j] -= displacements[0];
1015 }
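// (displacements are now relative to the values buffer, which is also the
// buffer address passed to MPI_Isend below, so one message carries all
// three arrays)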
1016
1017 // set the block lengths
1018 int block_length[3];
1019 block_length[0] = block_length[1] = block_length[2] = 1;
1020
1021 // now build the final datatype
1022 MPI_Datatype send_type;
1023 MPI_Type_create_struct(
1024 3, block_length, displacements, typelist, &send_type);
1025 MPI_Type_commit(&send_type);
1026 datatypes.push_back(send_type);
1027
1028 // send
1029 int tag = this->compute_tag(nproc, my_rank, p, 1);
1030 MPI_Request tr1;
1031 send_req.push_back(tr1);
1032 MPI_Isend(values_send + offset_nnz,
1033 1,
1034 send_type,
1035 p,
1036 tag,
1037 Global_communicator_pt->mpi_comm(),
1038 &send_req[c_send]);
1039 c_send++;
1040 }
1041 }
1042 }
1043 }
1044
1045 ///////////////////////////////////////////////////////////////////////////
1046 // RECV
1047 ///////////////////////////////////////////////////////////////////////////
1048 unsigned c_recv = 0;
1049 Vector<MPI_Request> recv_req;
1050
1051 // receive the contents of the vector
1052 for (unsigned p = 0; p < nproc; p++)
1053 {
1054 // use mpi methods to send to and receive from all but my rank
1055 if (p != my_rank)
1056 {
1057 // just receive
1058 if (nnz_recv[Color][p] != 0)
1059 {
1060 // create 3 MPI contiguous datatypes
1061 // + values
1062 // + column_index
1063 // + row_start
1064
1065 // values
1066 MPI_Datatype datatype_values;
1067 MPI_Type_contiguous(
1068 int(nnz_recv[Color][p]), MPI_DOUBLE, &datatype_values);
1069 MPI_Type_commit(&datatype_values);
1070 datatypes.push_back(datatype_values);
1071
1072 // column index
1073 MPI_Datatype datatype_column_index;
1074 MPI_Type_contiguous(
1075 int(nnz_recv[Color][p]), MPI_INT, &datatype_column_index);
1076 MPI_Type_commit(&datatype_column_index);
1077 datatypes.push_back(datatype_column_index);
1078
1079 // row start
1080 MPI_Datatype datatype_row_start;
1081 MPI_Type_contiguous(int(Nrow_local_from_proc[Color][p]),
1082 MPI_INT,
1083 &datatype_row_start);
1084 MPI_Type_commit(&datatype_row_start);
1085 datatypes.push_back(datatype_row_start);
1086
1087 // assemble the typelist
1088 MPI_Datatype typelist[3];
1089 typelist[0] = datatype_values;
1090 typelist[1] = datatype_column_index;
1091 typelist[2] = datatype_row_start;
1092
1093 // compute the offset for row_start
1094 int offset_n =
1095 First_row_from_proc[Color][p] - target_first_row[Color][my_rank];
1096
1097 // compute the offset for the values and column_index
1098 unsigned k = 0;
1099 while (nnz_start_proc[k] != p)
1100 {
1101 k++;
1102 }
1103 int offset_nnz = nnz_start_index[k];
1104
1105 // next compute the displacements
1106 MPI_Aint displacements[3];
1107 MPI_Get_address(values_recv + offset_nnz, &displacements[0]);
1108 MPI_Get_address(column_index_recv + offset_nnz, &displacements[1]);
1109 MPI_Get_address(row_start_recv + offset_n, &displacements[2]);
1110 for (int j = 2; j >= 0; j--)
1111 {
1112 displacements[j] -= displacements[0];
1113 }
1114
1115 // set the block lengths
1116 int block_length[3];
1117 block_length[0] = block_length[1] = block_length[2] = 1;
1118
1119 // now build the final datatype
1120 MPI_Datatype recv_type;
1121 MPI_Type_create_struct(
1122 3, block_length, displacements, typelist, &recv_type);
1123 MPI_Type_commit(&recv_type);
1124 datatypes.push_back(recv_type);
1125
1126 // recv
1127 int tag = this->compute_tag(nproc, p, my_rank, 1);
1128 MPI_Request tr1;
1129 recv_req.push_back(tr1);
1130 MPI_Irecv(values_recv + offset_nnz,
1131 1,
1132 recv_type,
1133 p,
1134 tag,
1135 Global_communicator_pt->mpi_comm(),
1136 &recv_req[c_recv]);
1137 c_recv++;
1138 }
1139 }
1140 }
1141
1142 // otherwise send to self (copy)
1143 if (nnz_recv[Color][my_rank] != 0)
1144 {
1145 // get pointers to the underlying data in the current matrix
1146 double* values_send = matrix_pt[Color]->value();
1147 int* row_start_send = matrix_pt[Color]->row_start();
1148 int* column_index_send = matrix_pt[Color]->column_index();
1149
1150 // offset for row_start send to self
1151 unsigned offset_n_send = First_row_for_proc[Color][my_rank] -
1152 matrix_pt[Color]->first_row(my_rank);
1153
1154 // offset for values and column_index send to self
1155 unsigned offset_nnz_send = row_start_send[offset_n_send];
1156
1157 // offset for row_start receive from self
1158 unsigned offset_n_recv = First_row_from_proc[Color][my_rank] -
1159 target_first_row[Color][my_rank];
1160
1161 // offset for values and column_index receive from self
1162 unsigned k = 0;
1163 while (nnz_start_proc[k] != my_rank)
1164 {
1165 k++;
1166 }
1167 unsigned offset_nnz_recv = nnz_start_index[k];
1168
1169 // and copy
1170
1171 // values and column_index
1172 unsigned n_nnz = nnz_send[Color][my_rank];
1173 for (unsigned j = 0; j < n_nnz; j++)
1174 {
1175 values_recv[offset_nnz_recv + j] = values_send[offset_nnz_send + j];
1176 column_index_recv[offset_nnz_recv + j] =
1177 column_index_send[offset_nnz_send + j];
1178 }
1179
1180 // row start
1181 unsigned n_n = Nrow_local_from_proc[Color][my_rank];
1182 for (unsigned j = 0; j < n_n; j++)
1183 {
1184 row_start_recv[offset_n_recv + j] = row_start_send[offset_n_send + j];
1185 }
1186 }
1187
1188 // create the local distribution for this matrix
1189 LinearAlgebraDistribution* temp_dist_pt =
1190 new LinearAlgebraDistribution(Local_communicator_pt,
1191 target_first_row[Color][my_rank],
1192 target_nrow_local[Color][my_rank]);
1193
1194 // create the corresponding matrix
1195 local_matrix_pt = new CRDoubleMatrix(temp_dist_pt);
1196 delete temp_dist_pt; // (dist has now been copied)
1197
1198 ///////////////////////////////////////////////////////////////////////////
1199 // and WAIT...
1200 ///////////////////////////////////////////////////////////////////////////
1201 if (c_recv != 0)
1202 {
1203 Vector<MPI_Status> recv_stat(c_recv);
1204 MPI_Waitall(c_recv, &recv_req[0], &recv_stat[0]);
1205 recv_req.clear();
1206 recv_stat.clear();
1207 }
1208
1209 // build the matrix
1210
1211 // update the row start
1212 unsigned nproc_contrib = nnz_start_index.size();
1213 for (unsigned j = 0; j < nproc_contrib; j++)
1214 {
1215 unsigned first = First_row_from_proc[Color][nnz_start_proc[j]] -
1216 target_first_row[Color][my_rank];
1217 unsigned last = first + Nrow_local_from_proc[Color][nnz_start_proc[j]];
1218 unsigned nnz_inc = nnz_start_index[j] - row_start_recv[first];
1219 for (unsigned k = first; k < last; k++)
1220 {
1221 row_start_recv[k] += nnz_inc;
1222 }
1223 }
1224 row_start_recv[target_nrow_local[Color][my_rank]] = int(nnz_total);
1225
1226 // build the matrix without a copy of the data
1227 local_matrix_pt->build_without_copy(matrix_pt[Color]->ncol(),
1228 nnz_total,
1229 values_recv,
1230 column_index_recv,
1231 row_start_recv);
1232
1233 // and finally wait for the sends
1234 if (c_send != 0)
1235 {
1236 Vector<MPI_Status> send_stat(c_send);
1237 MPI_Waitall(c_send, &send_req[0], &send_stat[0]);
1238 send_req.clear();
1239 send_stat.clear();
1240 }
1241
1242 // and clear the datatype
1243 unsigned ndatatypes = datatypes.size();
1244 for (unsigned i = 0; i < ndatatypes; i++)
1245 {
1246 MPI_Type_free(&datatypes[i]);
1247 }
1248 }
1249
1250
1251 /////////////////////////////////////////////////////////////////////////////
1252 /////////////////////////////////////////////////////////////////////////////
1253 /////////////////////////////////////////////////////////////////////////////
1254 /////////////////////////////////////////////////////////////////////////////
1255
1256
1257 // METHOD 2
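// Method 2: identical to Method 1 except that the non-blocking receives
// for this processor's preconditioner are posted before the sends.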
1258 else if (Method == 2)
1259 {
1260 // temporary storage for nnz recv
1261 unsigned* nnz_recv_temp = new unsigned[nproc * Nprec];
1262 for (unsigned j = 0; j < nproc * Nprec; j++)
1263 {
1264 nnz_recv_temp[j] = 0;
1265 }
1266
1267 // for every matrix we assemble the duplicate of the matrix on fewer
1268 // processors and setup the preconditioner
1269 for (unsigned i = 0; i < Nprec; i++)
1270 {
1271 // if the matrix is global (!distributed) then just construct a copy
1272 // on the subset of processors
1273 if (!matrix_pt[i]->distributed())
1274 {
1275 // if this matrix is to be preconditioned by this processor
1276 if (i == Color)
1277 {
1278 // create the local distribution for this matrix
1279 LinearAlgebraDistribution* temp_dist_pt =
1280 new LinearAlgebraDistribution(
1281 Local_communicator_pt, matrix_pt[i]->nrow(), false);
1282
1283 // create the corresponding matrix
1284 local_matrix_pt = new CRDoubleMatrix(temp_dist_pt);
1285 delete temp_dist_pt; // (dist has now been copied)
1286
1287 // get pointers to the underlying data
1288 double* values_pt = matrix_pt[i]->value();
1289 int* column_index_pt = matrix_pt[i]->column_index();
1290 int* row_start_pt = matrix_pt[i]->row_start();
1291
1292 // build the matrix without a copy of the data
1293 local_matrix_pt->build_without_copy(matrix_pt[i]->ncol(),
1294 matrix_pt[i]->nnz(),
1295 values_pt,
1296 column_index_pt,
1297 row_start_pt);
1298 }
1299 }
1300
1301 // otherwise the matrix is distributed and has to be redistributed over
1302 // this preconditioner's subset of processors
1303 else
1304 {
1305 // first compute the distribution of this preconditioner on its subset
1306 // of processors
1307
1308 // number of rows for this preconditioner
1309 unsigned nrow = matrix_pt[i]->nrow();
1310
1311 // set up the target first_row and nrow_local for this preconditioner
1312 target_first_row[i].resize(nproc);
1313 target_nrow_local[i].resize(nproc);
1314 unsigned nproc_local = Nproc_for_prec[i];
1315 for (unsigned p = 0; p < nproc_local; p++)
1316 {
1317 int pp = First_proc_for_prec[i] + p;
1318 target_first_row[i][pp] =
1319 unsigned(double(p * nrow) / double(nproc_local));
1320 }
1321 for (unsigned p = 0; p < nproc_local - 1; p++)
1322 {
1323 int pp = First_proc_for_prec[i] + p;
1324 target_nrow_local[i][pp] =
1325 target_first_row[i][pp + 1] - target_first_row[i][pp];
1326 }
1327 unsigned last_local_proc = First_proc_for_prec[i] + nproc_local - 1;
1328 target_nrow_local[i][last_local_proc] =
1329 nrow - target_first_row[i][last_local_proc];
1330
1331 // get the details of the current distribution
1332 Vector<unsigned> current_first_row(nproc);
1333 Vector<unsigned> current_nrow_local(nproc);
1334 for (unsigned p = 0; p < nproc; p++)
1335 {
1336 current_first_row[p] = matrix_pt[i]->first_row(p);
1337 current_nrow_local[p] = matrix_pt[i]->nrow_local(p);
1338 }
1339
1340 // resize storage for details of the data to be sent and received
1341 First_row_for_proc[i].resize(nproc, 0);
1342 Nrow_local_for_proc[i].resize(nproc, 0);
1343 First_row_from_proc[i].resize(nproc, 0);
1344 Nrow_local_from_proc[i].resize(nproc, 0);
1345
1346 // for every processor compute the first_row and nrow_local that will
1347 // be sent and received by this processor
1348 for (unsigned p = 0; p < nproc; p++)
1349 {
1350 // start with data to be sent
1351 if ((target_first_row[i][p] <
1352 (current_first_row[my_rank] + current_nrow_local[my_rank])) &&
1353 (current_first_row[my_rank] <
1354 (target_first_row[i][p] + target_nrow_local[i][p])))
1355 {
1356 First_row_for_proc[i][p] =
1357 std::max(current_first_row[my_rank], target_first_row[i][p]);
1358 Nrow_local_for_proc[i][p] =
1359 std::min(
1360 (current_first_row[my_rank] + current_nrow_local[my_rank]),
1361 (target_first_row[i][p] + target_nrow_local[i][p])) -
1362 First_row_for_proc[i][p];
1363 }
1364
1365 // and data to be received
1366 if ((target_first_row[i][my_rank] <
1367 (current_first_row[p] + current_nrow_local[p])) &&
1368 (current_first_row[p] < (target_first_row[i][my_rank] +
1369 target_nrow_local[i][my_rank])))
1370 {
1371 First_row_from_proc[i][p] =
1372 std::max(current_first_row[p], target_first_row[i][my_rank]);
1373 Nrow_local_from_proc[i][p] =
1374 std::min((current_first_row[p] + current_nrow_local[p]),
1375 (target_first_row[i][my_rank] +
1376 target_nrow_local[i][my_rank])) -
1377 First_row_from_proc[i][p];
1378 }
1379 }
1380
1381 // resize nnz_send
1382 nnz_send[i].resize(nproc);
1383
1384 // compute the number of nnzs to be sent
1385 // and the number of send and receive requests to be made (nreq)
1386 for (unsigned p = 0; p < nproc; p++)
1387 {
1388 if (Nrow_local_for_proc[i][p] != 0)
1389 {
1390 int* row_start = matrix_pt[i]->row_start();
1391 unsigned k =
1392 First_row_for_proc[i][p] - current_first_row[my_rank];
1393 nnz_send[i][p] =
1394 row_start[k + Nrow_local_for_proc[i][p]] - row_start[k];
1395 }
1396 }
1397
1398 // resize nnz_recv
1399 nnz_recv[i].resize(nproc);
1400
1401 // send nnz to be sent to each processor
1402 for (unsigned p = 0; p < nproc; p++)
1403 {
1404 // send and recv
1405
1406 // don't mpi send to self
1407 if (p != my_rank)
1408 {
1409 // non block send
1410 if (Nrow_local_for_proc[i][p] != 0)
1411 {
1412 // send to other processors
1413 int tag = this->compute_tag(nproc, my_rank, p, 0);
1414 MPI_Request tr;
1415 req.push_back(tr);
1416 MPI_Isend(&nnz_send[i][p],
1417 1,
1418 MPI_UNSIGNED,
1419 p,
1420 tag,
1421 Global_communicator_pt->mpi_comm(),
1422 &req[c]);
1423 c++;
1424 }
1425
1426 // non blocking recv
1427 if (Nrow_local_from_proc[i][p] != 0)
1428 {
1429 int tag = this->compute_tag(nproc, p, my_rank, 0);
1430 MPI_Request tr;
1431 req.push_back(tr);
1432 MPI_Irecv(nnz_recv_temp + (i * nproc) + p,
1433 1,
1434 MPI_UNSIGNED,
1435 p,
1436 tag,
1437 Global_communicator_pt->mpi_comm(),
1438 &req[c]);
1439 c++;
1440 }
1441 }
1442 // receive from self
1443 else
1444 {
1445 if (Nrow_local_for_proc[i][p] != 0)
1446 {
1447 nnz_recv_temp[(i * nproc) + p] = nnz_send[i][p];
1448 }
1449 }
1450 }
1451 }
1452 }
1453 if (c != 0)
1454 {
1455 Vector<MPI_Status> stat(c);
1456 MPI_Waitall(c, &req[0], &stat[0]);
1457 req.clear();
1458 stat.clear();
1459 c = 0;
1460 }
1461 for (unsigned i = 0; i < Nprec; i++)
1462 {
1463 for (unsigned p = 0; p < nproc; p++)
1464 {
1465 nnz_recv[i][p] = nnz_recv_temp[(i * nproc) + p];
1466 }
1467 }
1468 delete[] nnz_recv_temp;
1469
1470 // get the number of nnzs to be received from each processor
1471
1472 // total number of nnz to be received
1473 unsigned nnz_total = 0;
1474 for (unsigned p = 0; p < nproc; p++)
1475 {
1476 nnz_total += nnz_recv[Color][p];
1477 }
1478
1479 // compute nnz block start
1480 Vector<unsigned> nnz_start_proc;
1481 Vector<unsigned> nnz_start_index;
1482 unsigned row_ptr = target_first_row[Color][my_rank];
1483 int p = 0;
1484 unsigned nnz_ptr = 0;
1485 for (p = 0; p < int(nproc); p++)
1486 {
1487 if (First_row_from_proc[Color][p] == row_ptr &&
1488 Nrow_local_from_proc[Color][p] != 0 && nnz_ptr != nnz_total)
1489 {
1490 nnz_start_proc.push_back(p);
1491 nnz_start_index.push_back(nnz_ptr);
1492 nnz_ptr += nnz_recv[Color][p];
1493 row_ptr += Nrow_local_from_proc[Color][p];
1494 p = -1;
1495 }
1496 }
1497
1498 // storage for derived datatypes
1499 Vector<MPI_Datatype> datatypes;
1500
1501 // storage for received data
1502 double* values_recv = new double[nnz_total];
1503 int* column_index_recv = new int[nnz_total];
1504 int* row_start_recv = new int[target_nrow_local[Color][my_rank] + 1];
1505
1506 ///////////////////////////////////////////////////////////////////////////
1507 // RECV
1508 ///////////////////////////////////////////////////////////////////////////
1509 unsigned c_recv = 0;
1510 Vector<MPI_Request> recv_req;
1511
1512 // receive the contents of the vector
1513 for (unsigned p = 0; p < nproc; p++)
1514 {
1515 // use mpi methods to send to and receive from all but my rank
1516 if (p != my_rank)
1517 {
1518 // just receive
1519 if (nnz_recv[Color][p] != 0)
1520 {
1521 // create 3 MPI contiguous datatypes
1522 // + values
1523 // + column_index
1524 // + row_start
1525
1526 // values
1527 MPI_Datatype datatype_values;
1528 MPI_Type_contiguous(
1529 int(nnz_recv[Color][p]), MPI_DOUBLE, &datatype_values);
1530 MPI_Type_commit(&datatype_values);
1531 datatypes.push_back(datatype_values);
1532
1533 // column index
1534 MPI_Datatype datatype_column_index;
1535 MPI_Type_contiguous(
1536 int(nnz_recv[Color][p]), MPI_INT, &datatype_column_index);
1537 MPI_Type_commit(&datatype_column_index);
1538 datatypes.push_back(datatype_column_index);
1539
1540 // row start
1541 MPI_Datatype datatype_row_start;
1542 MPI_Type_contiguous(int(Nrow_local_from_proc[Color][p]),
1543 MPI_INT,
1544 &datatype_row_start);
1545 MPI_Type_commit(&datatype_row_start);
1546 datatypes.push_back(datatype_row_start);
1547
1548 // assemble the typelist
1549 MPI_Datatype typelist[3];
1550 typelist[0] = datatype_values;
1551 typelist[1] = datatype_column_index;
1552 typelist[2] = datatype_row_start;
1553
1554 // compute the offset for row_start
1555 int offset_n =
1556 First_row_from_proc[Color][p] - target_first_row[Color][my_rank];
1557
1558 // compute the offset for the values and column_index
1559 unsigned k = 0;
1560 while (nnz_start_proc[k] != p)
1561 {
1562 k++;
1563 }
1564 int offset_nnz = nnz_start_index[k];
1565
1566 // next compute the displacements
1567 MPI_Aint displacements[3];
1568 MPI_Get_address(values_recv + offset_nnz, &displacements[0]);
1569 MPI_Get_address(column_index_recv + offset_nnz, &displacements[1]);
1570 MPI_Get_address(row_start_recv + offset_n, &displacements[2]);
1571 for (int j = 2; j >= 0; j--)
1572 {
1573 displacements[j] -= displacements[0];
1574 }
1575
1576 // set the block lengths
1577 int block_length[3];
1578 block_length[0] = block_length[1] = block_length[2] = 1;
1579
1580 // now build the final datatype
1581 MPI_Datatype recv_type;
1582 MPI_Type_create_struct(
1583 3, block_length, displacements, typelist, &recv_type);
1584 MPI_Type_commit(&recv_type);
1585 datatypes.push_back(recv_type);
1586
1587 // recv
1588 int tag = this->compute_tag(nproc, p, my_rank, 1);
1589 MPI_Request tr1;
1590 recv_req.push_back(tr1);
1591 MPI_Irecv(values_recv + offset_nnz,
1592 1,
1593 recv_type,
1594 p,
1595 tag,
1596 Global_communicator_pt->mpi_comm(),
1597 &recv_req[c_recv]);
1598 c_recv++;
1599 }
1600 }
1601 }
1602
1603 ///////////////////////////////////////////////////////////////////////////
1604 // SEND
1605 ///////////////////////////////////////////////////////////////////////////
1606 unsigned c_send = 0;
1607 Vector<MPI_Request> send_req;
1608
1609 // for every matrix we assemble the duplicate of the matrix on fewer
1610 // processors and setup the preconditioner
1611 for (unsigned i = 0; i < Nprec; i++)
1612 {
1613 // get pointers to the underlying data in the current matrix
1614 double* values_send = matrix_pt[i]->value();
1615 int* row_start_send = matrix_pt[i]->row_start();
1616 int* column_index_send = matrix_pt[i]->column_index();
1617
1618 // send and receive the contents of the vector
1619 for (unsigned p = 0; p < nproc; p++)
1620 {
1621 // use mpi methods to send to and receive from all but my rank
1622 if (p != my_rank)
1623 {
1624 // send
1625 if (nnz_send[i][p] != 0)
1626 {
1627 // create 3 MPI contiguous datatypes
1628 // + values
1629 // + column_index
1630 // + row_start
1631
1632 // values
1633 MPI_Datatype datatype_values;
1634 MPI_Type_contiguous(
1635 int(nnz_send[i][p]), MPI_DOUBLE, &datatype_values);
1636 MPI_Type_commit(&datatype_values);
1637 datatypes.push_back(datatype_values);
1638
1639 // column index
1640 MPI_Datatype datatype_column_index;
1641 MPI_Type_contiguous(
1642 int(nnz_send[i][p]), MPI_INT, &datatype_column_index);
1643 MPI_Type_commit(&datatype_column_index);
1644 datatypes.push_back(datatype_column_index);
1645
1646 // row start
1647 MPI_Datatype datatype_row_start;
1648 MPI_Type_contiguous(
1649 int(Nrow_local_for_proc[i][p]), MPI_INT, &datatype_row_start);
1650 MPI_Type_commit(&datatype_row_start);
1651 datatypes.push_back(datatype_row_start);
1652
1653 // assemble the typelist
1654 MPI_Datatype typelist[3];
1655 typelist[0] = datatype_values;
1656 typelist[1] = datatype_column_index;
1657 typelist[2] = datatype_row_start;
1658
1659 // compute the offset for row_start
1660 int offset_n =
1661 First_row_for_proc[i][p] - matrix_pt[i]->first_row(my_rank);
1662
1663 // compute the offset for the values and column_index
1664 int offset_nnz = row_start_send[offset_n];
1665
1666 // next compute the displacements
1667 MPI_Aint displacements[3];
1668 MPI_Get_address(values_send + offset_nnz, &displacements[0]);
1669 MPI_Get_address(column_index_send + offset_nnz,
1670 &displacements[1]);
1671 MPI_Get_address(row_start_send + offset_n, &displacements[2]);
1672 for (int j = 2; j >= 0; j--)
1673 {
1674 displacements[j] -= displacements[0];
1675 }
1676
1677 // set the block lengths
1678 int block_length[3];
1679 block_length[0] = block_length[1] = block_length[2] = 1;
1680
1681 // now build the final datatype
1682 MPI_Datatype send_type;
1683 MPI_Type_create_struct(
1684 3, block_length, displacements, typelist, &send_type);
1685 MPI_Type_commit(&send_type);
1686 datatypes.push_back(send_type);
1687
1688 // send
1689 int tag = this->compute_tag(nproc, my_rank, p, 1);
1690 MPI_Request tr1;
1691 send_req.push_back(tr1);
1692 MPI_Isend(values_send + offset_nnz,
1693 1,
1694 send_type,
1695 p,
1696 tag,
1697 Global_communicator_pt->mpi_comm(),
1698 &send_req[c_send]);
1699 c_send++;
1700 }
1701 }
1702 }
1703 }
1704
1705 // otherwise send to self (copy)
1706 if (nnz_recv[Color][my_rank] != 0)
1707 {
1708 // get pointers to the underlying data in the current matrix
1709 double* values_send = matrix_pt[Color]->value();
1710 int* row_start_send = matrix_pt[Color]->row_start();
1711 int* column_index_send = matrix_pt[Color]->column_index();
1712
1713 // offset for row_start send to self
1714 unsigned offset_n_send = First_row_for_proc[Color][my_rank] -
1715 matrix_pt[Color]->first_row(my_rank);
1716
1717 // offset for values and column_index send to self
1718 unsigned offset_nnz_send = row_start_send[offset_n_send];
1719
1720 // offset for row_start receive from self
1721 unsigned offset_n_recv = First_row_from_proc[Color][my_rank] -
1722 target_first_row[Color][my_rank];
1723
1724 // offset for values and column_index receive from self
1725 unsigned k = 0;
1726 while (nnz_start_proc[k] != my_rank)
1727 {
1728 k++;
1729 }
1730 unsigned offset_nnz_recv = nnz_start_index[k];
1731
1732 // and copy
1733
1734 // values and column_index
1735 unsigned n_nnz = nnz_send[Color][my_rank];
1736 for (unsigned j = 0; j < n_nnz; j++)
1737 {
1738 values_recv[offset_nnz_recv + j] = values_send[offset_nnz_send + j];
1739 column_index_recv[offset_nnz_recv + j] =
1740 column_index_send[offset_nnz_send + j];
1741 }
1742
1743 // row start
1744 unsigned n_n = Nrow_local_from_proc[Color][my_rank];
1745 for (unsigned j = 0; j < n_n; j++)
1746 {
1747 row_start_recv[offset_n_recv + j] = row_start_send[offset_n_send + j];
1748 }
1749 }
1750
1751 // create the local distribution for this matrix
1752 LinearAlgebraDistribution* temp_dist_pt =
1753 new LinearAlgebraDistribution(Local_communicator_pt,
1754 target_first_row[Color][my_rank],
1755 target_nrow_local[Color][my_rank]);
1756
1757 // create the corresponding matrix
1758 local_matrix_pt = new CRDoubleMatrix(temp_dist_pt);
1759 delete temp_dist_pt; // (dist has now been copied)
1760
1761 ///////////////////////////////////////////////////////////////////////////
1762 // and WAIT...
1763 ///////////////////////////////////////////////////////////////////////////
1764 if (c_recv != 0)
1765 {
1766 Vector<MPI_Status> recv_stat(c_recv);
1767 MPI_Waitall(c_recv, &recv_req[0], &recv_stat[0]);
1768 recv_req.clear();
1769 recv_stat.clear();
1770 }
1771
1772 // build the matrix
1773
1774 // update the row start
1775 unsigned nproc_contrib = nnz_start_index.size();
1776 for (unsigned j = 0; j < nproc_contrib; j++)
1777 {
1778 unsigned first = First_row_from_proc[Color][nnz_start_proc[j]] -
1779 target_first_row[Color][my_rank];
1780 unsigned last = first + Nrow_local_from_proc[Color][nnz_start_proc[j]];
1781 unsigned nnz_inc = nnz_start_index[j] - row_start_recv[first];
1782 for (unsigned k = first; k < last; k++)
1783 {
1784 row_start_recv[k] += nnz_inc;
1785 }
1786 }
1787 row_start_recv[target_nrow_local[Color][my_rank]] = int(nnz_total);
1788
1789 // build the matrix without a copy of the data
1790 local_matrix_pt->build_without_copy(matrix_pt[Color]->ncol(),
1791 nnz_total,
1792 values_recv,
1793 column_index_recv,
1794 row_start_recv);
1795
1796 // and finally wait for the sends
1797 if (c_send != 0)
1798 {
1799 Vector<MPI_Status> send_stat(c_send);
1800 MPI_Waitall(c_send, &send_req[0], &send_stat[0]);
1801 send_req.clear();
1802 send_stat.clear();
1803 }
1804
1805 // and clear the datatype
1806 unsigned ndatatypes = datatypes.size();
1807 for (unsigned i = 0; i < ndatatypes; i++)
1808 {
1809 MPI_Type_free(&datatypes[i]);
1810 }
1811 }
1812
1813
1814 /////////////////////////////////////////////////////////////////////////////
1815 /////////////////////////////////////////////////////////////////////////////
1816 /////////////////////////////////////////////////////////////////////////////
1817 /////////////////////////////////////////////////////////////////////////////
1818
1819
1820 // METHOD 3
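// Method 3: as Method 1, but the nonzero counts are only sent with
// non-blocking sends here; they are gathered in a separate receive loop
// over the preconditioners below.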
1821 else if (Method == 3)
1822 {
1823 // temporary storage for nnz recv
1824 unsigned* nnz_recv_temp = new unsigned[nproc * Nprec];
1825 for (unsigned j = 0; j < nproc * Nprec; j++)
1826 {
1827 nnz_recv_temp[j] = 0;
1828 }
1829
1830 // for every matrix we assemble the duplicate of the matrix on fewer
1831 // processors and setup the preconditioner
1832 for (unsigned i = 0; i < Nprec; i++)
1833 {
1834 // if the matrix is global (!distributed) then just construct a copy
1835 // on the subset of processors
1836 if (!matrix_pt[i]->distributed())
1837 {
1838 // if this matrix is to be preconditioned by this processor
1839 if (i == Color)
1840 {
1841 // create the local distribution for this matrix
1842 LinearAlgebraDistribution* temp_dist_pt =
1843 new LinearAlgebraDistribution(
1844 Local_communicator_pt, matrix_pt[i]->nrow(), false);
1845
1846 // create the corresponding matrix
1847 local_matrix_pt = new CRDoubleMatrix(temp_dist_pt);
1848 delete temp_dist_pt; // (dist has now been copied)
1849
1850 // get pointers to the underlying data
1851 double* values_pt = matrix_pt[i]->value();
1852 int* column_index_pt = matrix_pt[i]->column_index();
1853 int* row_start_pt = matrix_pt[i]->row_start();
1854
1855 // build the matrix without a copy of the data
1856 local_matrix_pt->build_without_copy(matrix_pt[i]->ncol(),
1857 matrix_pt[i]->nnz(),
1858 values_pt,
1859 column_index_pt,
1860 row_start_pt);
1861 }
1862 }
1863
1864 // otherwise the matrix is distributed and has to be redistributed over
1865 // this preconditioner's subset of processors
1866 else
1867 {
1868 // first compute the distribution of this preconditioner on its subset
1869 // of processors
1870
1871 // number of rows for this preconditioner
1872 unsigned nrow = matrix_pt[i]->nrow();
1873
1874 // set up the target first_row and nrow_local for this preconditioner
1875 target_first_row[i].resize(nproc);
1876 target_nrow_local[i].resize(nproc);
1877 unsigned nproc_local = Nproc_for_prec[i];
1878 for (unsigned p = 0; p < nproc_local; p++)
1879 {
1880 int pp = First_proc_for_prec[i] + p;
1881 target_first_row[i][pp] =
1882 unsigned(double(p * nrow) / double(nproc_local));
1883 }
1884 for (unsigned p = 0; p < nproc_local - 1; p++)
1885 {
1886 int pp = First_proc_for_prec[i] + p;
1887 target_nrow_local[i][pp] =
1888 target_first_row[i][pp + 1] - target_first_row[i][pp];
1889 }
1890 unsigned last_local_proc = First_proc_for_prec[i] + nproc_local - 1;
1891 target_nrow_local[i][last_local_proc] =
1892 nrow - target_first_row[i][last_local_proc];
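// Illustrative example of the row split above: with nrow = 10 rows shared
// by nproc_local = 3 processors, target_first_row becomes {0, 3, 6} and
// target_nrow_local becomes {3, 3, 4}, i.e. processor p's block starts at
// p*nrow/nproc_local (rounded down) and the last processor in the subset
// absorbs the remainder.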
1893
1894 // get the details of the current distribution
1895 Vector<unsigned> current_first_row(nproc);
1896 Vector<unsigned> current_nrow_local(nproc);
1897 for (unsigned p = 0; p < nproc; p++)
1898 {
1899 current_first_row[p] = matrix_pt[i]->first_row(p);
1900 current_nrow_local[p] = matrix_pt[i]->nrow_local(p);
1901 }
1902
1903 // resize storage for details of the data to be sent and received
1904 First_row_for_proc[i].resize(nproc, 0);
1905 Nrow_local_for_proc[i].resize(nproc, 0);
1906 First_row_from_proc[i].resize(nproc, 0);
1907 Nrow_local_from_proc[i].resize(nproc, 0);
1908
1909 // for every processor compute the first_row and nrow_local that will
1910 // be sent and received by this processor
1911 for (unsigned p = 0; p < nproc; p++)
1912 {
1913 // start with data to be sent
1914 if ((target_first_row[i][p] <
1915 (current_first_row[my_rank] + current_nrow_local[my_rank])) &&
1916 (current_first_row[my_rank] <
1917 (target_first_row[i][p] + target_nrow_local[i][p])))
1918 {
1919 First_row_for_proc[i][p] =
1920 std::max(current_first_row[my_rank], target_first_row[i][p]);
1921 Nrow_local_for_proc[i][p] =
1922 std::min(
1923 (current_first_row[my_rank] + current_nrow_local[my_rank]),
1924 (target_first_row[i][p] + target_nrow_local[i][p])) -
1925 First_row_for_proc[i][p];
1926 }
1927
1928 // and data to be received
1929 if ((target_first_row[i][my_rank] <
1930 (current_first_row[p] + current_nrow_local[p])) &&
1931 (current_first_row[p] < (target_first_row[i][my_rank] +
1932 target_nrow_local[i][my_rank])))
1933 {
1934 First_row_from_proc[i][p] =
1935 std::max(current_first_row[p], target_first_row[i][my_rank]);
1936 Nrow_local_from_proc[i][p] =
1937 std::min((current_first_row[p] + current_nrow_local[p]),
1938 (target_first_row[i][my_rank] +
1939 target_nrow_local[i][my_rank])) -
1940 First_row_from_proc[i][p];
1941 }
1942 }
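// In other words: the rows to be sent to processor p are the overlap of
// the half-open ranges
// [current_first_row[my_rank], current_first_row[my_rank] + current_nrow_local[my_rank])
// and
// [target_first_row[i][p], target_first_row[i][p] + target_nrow_local[i][p]);
// the rows to be received are the analogous overlap with the roles of p
// and my_rank swapped. For example, if this processor currently holds
// rows [4,8) and processor p's target block is [6,12), rows [6,8) are
// sent: First_row_for_proc[i][p] = 6 and Nrow_local_for_proc[i][p] = 2.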
1943
1944 // resize nnz_send
1945 nnz_send[i].resize(nproc);
1946
1947 // compute the number of nnzs to be sent
1948 // to each processor
1949 for (unsigned p = 0; p < nproc; p++)
1950 {
1951 if (Nrow_local_for_proc[i][p] != 0)
1952 {
1953 int* row_start = matrix_pt[i]->row_start();
1954 unsigned k =
1955 First_row_for_proc[i][p] - current_first_row[my_rank];
1956 nnz_send[i][p] =
1957 row_start[k + Nrow_local_for_proc[i][p]] - row_start[k];
1958 }
1959 }
1960
1961 // resize nnz_recv
1962 nnz_recv[i].resize(nproc);
1963
1964 // send the nnz counts to the processors that will receive data from us
1965 for (unsigned p = 0; p < nproc; p++)
1966 {
1967 // send and recv
1968
1969 // don't MPI send to self
1970 if (p != my_rank)
1971 {
1972 // non block send
1973 if (Nrow_local_for_proc[i][p] != 0)
1974 {
1975 // send to other processors
1976 int tag = this->compute_tag(nproc, my_rank, p, 0);
1977 MPI_Request tr;
1978 req.push_back(tr);
1979 MPI_Isend(&nnz_send[i][p],
1980 1,
1981 MPI_UNSIGNED,
1982 p,
1983 tag,
1984 Global_communicator_pt->mpi_comm(),
1985 &req[c]);
1986 c++;
1987 }
1988 }
1989 // copy from self (no MPI needed)
1990 else
1991 {
1992 if (Nrow_local_for_proc[i][p] != 0)
1993 {
1994 nnz_recv_temp[(i * nproc) + p] = nnz_send[i][p];
1995 }
1996 }
1997 }
1998 }
1999 }
2000
2001 for (unsigned i = 0; i < Nprec; i++)
2002 {
2003 // resize nnz_recv
2004 nnz_recv[i].resize(nproc);
2005
2006 // receive nnz from other processors
2007 for (unsigned pp = 0; pp < nproc; pp++)
2008 {
2009 // next processor to receive from
2010 unsigned p = (nproc + my_rank - pp) % nproc;
2011
2012 // don't MPI receive from self
2013 if (p != my_rank)
2014 {
2015 // blocking recv
2016 if (Nrow_local_from_proc[i][p] != 0)
2017 {
2018 int tag = this->compute_tag(nproc, p, my_rank, 0);
2019 MPI_Status stat;
2020 unsigned nnz_temp;
2021 MPI_Recv(&nnz_temp,
2022 1,
2023 MPI_UNSIGNED,
2024 p,
2025 tag,
2026 Global_communicator_pt->mpi_comm(),
2027 &stat);
2028 nnz_recv[i][p] = nnz_temp;
2029 }
2030 }
2031
2032 // receive from self
2033 else
2034 {
2035 nnz_recv[i][p] = nnz_send[i][p];
2036 }
2037 }
2038 }
2039
2040 // get the number of nnzs to be received from each processor
2041
2042 // total number of nnz to be received
2043 unsigned nnz_total = 0;
2044 for (unsigned p = 0; p < nproc; p++)
2045 {
2046 nnz_total += nnz_recv[Color][p];
2047 }
2048
2049 // compute nnz block start
2050 Vector<unsigned> nnz_start_proc;
2051 Vector<unsigned> nnz_start_index;
2052 unsigned row_ptr = target_first_row[Color][my_rank];
2053 int p = 0;
2054 unsigned nnz_ptr = 0;
2055 for (p = 0; p < int(nproc); p++)
2056 {
2057 if (First_row_from_proc[Color][p] == row_ptr &&
2058 Nrow_local_from_proc[Color][p] != 0 && nnz_ptr != nnz_total)
2059 {
2060 nnz_start_proc.push_back(p);
2061 nnz_start_index.push_back(nnz_ptr);
2062 nnz_ptr += nnz_recv[Color][p];
2063 row_ptr += Nrow_local_from_proc[Color][p];
2064 p = -1;
2065 }
2066 }
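// The scan above orders the contributing processors by the position of
// their row blocks within this processor's target block and records where
// each contribution will start in the contiguous receive buffers; the
// index p is reset to -1 after every match because the contributors are
// not necessarily visited in row order. Illustrative example: if
// processor 3 supplies the first 5 local rows with 20 nonzeros and
// processor 1 supplies the next 4 rows with 12 nonzeros, then
// nnz_start_proc = {3, 1} and nnz_start_index = {0, 20}.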
2067
2068 // storage for derived datatypes
2069 Vector<MPI_Datatype> datatypes;
2070
2071 // storage for received data
2072 double* values_recv = new double[nnz_total];
2073 int* column_index_recv = new int[nnz_total];
2074 int* row_start_recv = new int[target_nrow_local[Color][my_rank] + 1];
2075
2076 ///////////////////////////////////////////////////////////////////////////
2077 // RECV
2078 ///////////////////////////////////////////////////////////////////////////
2079 unsigned c_recv = 0;
2080 Vector<MPI_Request> recv_req;
2081
2082 // receive the matrix data (values, column_index and row_start)
2083 for (unsigned p = 0; p < nproc; p++)
2084 {
2085 // use mpi methods to send to and receive from all but my rank
2086 if (p != my_rank)
2087 {
2088 // just receive
2089 if (nnz_recv[Color][p] != 0)
2090 {
2091 // create 3 MPI contiguous datatypes
2092 // + values
2093 // + column_index
2094 // + row_start
2095
2096 // values
2097 MPI_Datatype datatype_values;
2098 MPI_Type_contiguous(
2099 int(nnz_recv[Color][p]), MPI_DOUBLE, &datatype_values);
2100 MPI_Type_commit(&datatype_values);
2101 datatypes.push_back(datatype_values);
2102
2103 // column index
2104 MPI_Datatype datatype_column_index;
2105 MPI_Type_contiguous(
2106 int(nnz_recv[Color][p]), MPI_INT, &datatype_column_index);
2107 MPI_Type_commit(&datatype_column_index);
2108 datatypes.push_back(datatype_column_index);
2109
2110 // row start
2111 MPI_Datatype datatype_row_start;
2112 MPI_Type_contiguous(int(Nrow_local_from_proc[Color][p]),
2113 MPI_INT,
2114 &datatype_row_start);
2115 MPI_Type_commit(&datatype_row_start);
2116 datatypes.push_back(datatype_row_start);
2117
2118 // assemble the typelist
2119 MPI_Datatype typelist[3];
2120 typelist[0] = datatype_values;
2121 typelist[1] = datatype_column_index;
2122 typelist[2] = datatype_row_start;
2123
2124 // compute the offset for row_start
2125 int offset_n =
2126 First_row_from_proc[Color][p] - target_first_row[Color][my_rank];
2127
2128 // compute the offset for the values and column_index
2129 unsigned k = 0;
2130 while (nnz_start_proc[k] != p)
2131 {
2132 k++;
2133 }
2134 int offset_nnz = nnz_start_index[k];
2135
2136 // next compute the displacements
2137 MPI_Aint displacements[3];
2138 MPI_Get_address(values_recv + offset_nnz, &displacements[0]);
2139 MPI_Get_address(column_index_recv + offset_nnz, &displacements[1]);
2140 MPI_Get_address(row_start_recv + offset_n, &displacements[2]);
2141 for (int j = 2; j >= 0; j--)
2142 {
2143 displacements[j] -= displacements[0];
2144 }
2145
2146 // set the block lengths
2147 int block_length[3];
2148 block_length[0] = block_length[1] = block_length[2] = 1;
2149
2150 // now build the final datatype
2151 MPI_Datatype recv_type;
2152 MPI_Type_create_struct(
2153 3, block_length, displacements, typelist, &recv_type);
2154 MPI_Type_commit(&recv_type);
2155 datatypes.push_back(recv_type);
2156
2157 // recv
2158 int tag = this->compute_tag(nproc, p, my_rank, 1);
2159 MPI_Request tr1;
2160 recv_req.push_back(tr1);
2161 MPI_Irecv(values_recv + offset_nnz,
2162 1,
2163 recv_type,
2164 p,
2165 tag,
2166 Global_communicator_pt->mpi_comm(),
2167 &recv_req[c_recv]);
2168 c_recv++;
2169 }
2170 }
2171 }
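// Note: each incoming message delivers three segments (the values, the
// column indices and a slice of row_start) through a single MPI_Irecv;
// the struct datatype stores the three target addresses as displacements
// relative to values_recv + offset_nnz, so the data arrives directly in
// its final position and no unpacking buffer is required.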
2172
2173 ///////////////////////////////////////////////////////////////////////////
2174 // SEND
2175 ///////////////////////////////////////////////////////////////////////////
2176 unsigned c_send = 0;
2177 Vector<MPI_Request> send_req;
2178
2179 // for every matrix we assemble the duplicate of the matrix on fewer
2180 // processors and setup the preconditioner
2181 for (unsigned i = 0; i < Nprec; i++)
2182 {
2183 // get pointers to the underlying data in the current matrix
2184 double* values_send = matrix_pt[i]->value();
2185 int* row_start_send = matrix_pt[i]->row_start();
2186 int* column_index_send = matrix_pt[i]->column_index();
2187
2188 // send the matrix data to each processor that requires it
2189 for (unsigned p = 0; p < nproc; p++)
2190 {
2191 // use mpi methods to send to and receive from all but my rank
2192 if (p != my_rank)
2193 {
2194 // send
2195 if (nnz_send[i][p] != 0)
2196 {
2197 // create 3 MPI contiguous datatypes
2198 // + values
2199 // + column_index
2200 // + row_start
2201
2202 // values
2203 MPI_Datatype datatype_values;
2204 MPI_Type_contiguous(
2205 int(nnz_send[i][p]), MPI_DOUBLE, &datatype_values);
2206 MPI_Type_commit(&datatype_values);
2207 datatypes.push_back(datatype_values);
2208
2209 // column index
2210 MPI_Datatype datatype_column_index;
2211 MPI_Type_contiguous(
2212 int(nnz_send[i][p]), MPI_INT, &datatype_column_index);
2213 MPI_Type_commit(&datatype_column_index);
2214 datatypes.push_back(datatype_column_index);
2215
2216 // row start
2217 MPI_Datatype datatype_row_start;
2218 MPI_Type_contiguous(
2219 int(Nrow_local_for_proc[i][p]), MPI_INT, &datatype_row_start);
2220 MPI_Type_commit(&datatype_row_start);
2221 datatypes.push_back(datatype_row_start);
2222
2223 // assemble the typelist
2224 MPI_Datatype typelist[3];
2225 typelist[0] = datatype_values;
2226 typelist[1] = datatype_column_index;
2227 typelist[2] = datatype_row_start;
2228
2229 // compute the offset for row_start
2230 int offset_n =
2231 First_row_for_proc[i][p] - matrix_pt[i]->first_row(my_rank);
2232
2233 // compute the offset for the values and column_index
2234 int offset_nnz = row_start_send[offset_n];
2235
2236 // next compute the displacements
2237 MPI_Aint displacements[3];
2238 MPI_Get_address(values_send + offset_nnz, &displacements[0]);
2239 MPI_Get_address(column_index_send + offset_nnz,
2240 &displacements[1]);
2241 MPI_Get_address(row_start_send + offset_n, &displacements[2]);
2242 for (int j = 2; j >= 0; j--)
2243 {
2244 displacements[j] -= displacements[0];
2245 }
2246
2247 // set the block lengths
2248 int block_length[3];
2249 block_length[0] = block_length[1] = block_length[2] = 1;
2250
2251 // now build the final datatype
2252 MPI_Datatype send_type;
2253 MPI_Type_create_struct(
2254 3, block_length, displacements, typelist, &send_type);
2255 MPI_Type_commit(&send_type);
2256 datatypes.push_back(send_type);
2257
2258 // send
2259 int tag = this->compute_tag(nproc, my_rank, p, 1);
2260 MPI_Request tr1;
2261 send_req.push_back(tr1);
2262 MPI_Isend(values_send + offset_nnz,
2263 1,
2264 send_type,
2265 p,
2266 tag,
2267 Global_communicator_pt->mpi_comm(),
2268 &send_req[c_send]);
2269 c_send++;
2270 }
2271 }
2272 }
2273 }
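// The send side mirrors the receives: one struct datatype per destination
// describes the relevant slices of values, column_index and row_start in
// the existing matrix storage, so the data is sent in place without being
// copied into a separate send buffer.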
2274
2275 // now deal with the data this processor sends to itself (a plain copy)
2276 if (nnz_recv[Color][my_rank] != 0)
2277 {
2278 // get pointers to the underlying data in the current matrix
2279 double* values_send = matrix_pt[Color]->value();
2280 int* row_start_send = matrix_pt[Color]->row_start();
2281 int* column_index_send = matrix_pt[Color]->column_index();
2282
2283 // offset for row_start send to self
2284 unsigned offset_n_send = First_row_for_proc[Color][my_rank] -
2285 matrix_pt[Color]->first_row(my_rank);
2286
2287 // offset for values and column_index send to self
2288 unsigned offset_nnz_send = row_start_send[offset_n_send];
2289
2290 // offset for row_start receive from self
2291 unsigned offset_n_recv = First_row_from_proc[Color][my_rank] -
2292 target_first_row[Color][my_rank];
2293
2294 // offset for values and column_index receive from self
2295 unsigned k = 0;
2296 while (nnz_start_proc[k] != my_rank)
2297 {
2298 k++;
2299 }
2300 unsigned offset_nnz_recv = nnz_start_index[k];
2301
2302 // and copy
2303
2304 // values and column_index
2305 unsigned n_nnz = nnz_send[Color][my_rank];
2306 for (unsigned j = 0; j < n_nnz; j++)
2307 {
2308 values_recv[offset_nnz_recv + j] = values_send[offset_nnz_send + j];
2309 column_index_recv[offset_nnz_recv + j] =
2310 column_index_send[offset_nnz_send + j];
2311 }
2312
2313 // row start
2314 unsigned n_n = Nrow_local_from_proc[Color][my_rank];
2315 for (unsigned j = 0; j < n_n; j++)
2316 {
2317 row_start_recv[offset_n_recv + j] = row_start_send[offset_n_send + j];
2318 }
2319 }
2320
2321 // create the local distribution for this matrix
2322 LinearAlgebraDistribution* temp_dist_pt =
2323 new LinearAlgebraDistribution(Local_communicator_pt,
2324 target_first_row[Color][my_rank],
2325 target_nrow_local[Color][my_rank]);
2326
2327 // create the corresponding matrix
2328 local_matrix_pt = new CRDoubleMatrix(temp_dist_pt);
2329 delete temp_dist_pt; // (dist has now been copied)
2330
2331 ///////////////////////////////////////////////////////////////////////////
2332 // and WAIT...
2333 ///////////////////////////////////////////////////////////////////////////
2334 if (c_recv != 0)
2335 {
2336 Vector<MPI_Status> recv_stat(c_recv);
2337 MPI_Waitall(c_recv, &recv_req[0], &recv_stat[0]);
2338 recv_req.clear();
2339 recv_stat.clear();
2340 }
2341
2342 // build the matrix
2343
2344 // update the row start
2345 unsigned nproc_contrib = nnz_start_index.size();
2346 for (unsigned j = 0; j < nproc_contrib; j++)
2347 {
2348 unsigned first = First_row_from_proc[Color][nnz_start_proc[j]] -
2349 target_first_row[Color][my_rank];
2350 unsigned last = first + Nrow_local_from_proc[Color][nnz_start_proc[j]];
2351 unsigned nnz_inc = nnz_start_index[j] - row_start_recv[first];
2352 for (unsigned k = first; k < last; k++)
2353 {
2354 row_start_recv[k] += nnz_inc;
2355 }
2356 }
2357 row_start_recv[target_nrow_local[Color][my_rank]] = int(nnz_total);
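// The row_start entries received from each processor still count nonzeros
// from the start of that processor's original matrix, so each block is
// shifted to count from the start of the new local matrix instead.
// Illustrative example: if the block received from processor
// nnz_start_proc[j] should begin at nonzero index nnz_start_index[j] = 20
// but its first entry arrived as 57, every entry in that block is shifted
// down by 37. Finally the last entry of row_start is set to the total
// number of nonzeros, as required by compressed row storage.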
2358
2359 // build the matrix without a copy of the data
2360 local_matrix_pt->build_without_copy(matrix_pt[Color]->ncol(),
2361 nnz_total,
2362 values_recv,
2363 column_index_recv,
2364 row_start_recv);
2365
2366 // and finally wait for the sends
2367 if (c_send != 0)
2368 {
2369 Vector<MPI_Status> send_stat(c_send);
2370 MPI_Waitall(c_send, &send_req[0], &send_stat[0]);
2371 send_req.clear();
2372 send_stat.clear();
2373 }
2374
2375 // and free the derived datatypes
2376 unsigned ndatatypes = datatypes.size();
2377 for (unsigned i = 0; i < ndatatypes; i++)
2378 {
2379 MPI_Type_free(&datatypes[i]);
2380 }
2381 }
2382
2383 // now setup the preconditioner
2384 Preconditioner_pt = prec_pt[Color];
2385 Preconditioner_pt->setup(local_matrix_pt);
2386
2387 // clean up memory
2388 if (matrix_pt[0]->distributed())
2389 {
2390 delete local_matrix_pt;
2391 }
2392
2393 // delete the preconditioners not used on this processor
2394 for (unsigned i = 0; i < Nprec; i++)
2395 {
2396 if (i != Color)
2397 {
2398 delete prec_pt[i];
2399 }
2400 }
2401 } // end of setup_preconditioners()
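// A minimal usage sketch of the two methods defined in this file
// (illustrative only; the matrices, the preconditioners and
// communicator_pt are assumed to exist and to be fully built):
//
//   Vector<CRDoubleMatrix*> matrix_pt(2);
//   Vector<Preconditioner*> prec_pt(2);
//   // ... point matrix_pt[*] and prec_pt[*] at the block matrices and
//   //     the preconditioners to be applied to them ...
//   PreconditionerArray prec_array;
//   prec_array.setup_preconditioners(matrix_pt, prec_pt, communicator_pt);
//
//   // later, e.g. inside an outer block preconditioner:
//   Vector<DoubleVector> r(2);
//   Vector<DoubleVector> z(2);
//   // ... fill r[*] with the block residuals ...
//   prec_array.solve_preconditioners(r, z);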
2402
2403 //============================================================================
2404 /// Applies each preconditioner to the corresponding vector in
2405 /// r and z
2406 //=============================================================================
2407 void PreconditionerArray::solve_preconditioners(const Vector<DoubleVector>& r,
2408 Vector<DoubleVector>& z)
2409 {
2410#ifdef PARANOID
2411 // check that a preconditioner has been setup
2412 if (Preconditioner_pt == 0)
2413 {
2414 std::ostringstream error_message;
2415 error_message << "The preconditioners have not been setup.";
2416 throw OomphLibError(
2417 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2418 }
2419
2420 // check that r is the correct length
2421 if (r.size() != Nprec)
2422 {
2423 std::ostringstream error_message;
2424 error_message << "This PreconditionerArray has " << Nprec
2425 << " preconditioners but r only contains " << r.size()
2426 << " preconditioners.";
2427 throw OomphLibError(
2428 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2429 }
2430
2431 // check that z is the correct length
2432 if (z.size() != Nprec)
2433 {
2434 std::ostringstream error_message;
2435 error_message << "This PreconditionerArray has " << Nprec
2436 << " preconditioners but z only contains " << z.size()
2437 << " preconditioners.";
2438 throw OomphLibError(
2439 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2440 }
2441 // check that each vector has the same distribution as the
2442 // corresponding matrix passed to setup_preconditioners(...)
2443 for (unsigned i = 0; i < Nprec; i++)
2444 {
2445 if (*r[i].distribution_pt() != *Distribution_pt[i])
2446 {
2447 std::ostringstream error_message;
2448 error_message << "The distribution of r[" << i << "] does not have the"
2449 << " the same distribution as the matrix_pt[" << i
2450 << "] that was passed to setup_preconditioners(...)";
2451 throw OomphLibError(error_message.str(),
2452 OOMPH_CURRENT_FUNCTION,
2453 OOMPH_EXCEPTION_LOCATION);
2454 }
2455 }
2456#endif
2457
2458 // the local r vector
2459 DoubleVector local_r(Preconditioner_pt->distribution_pt(), 0.0);
2460
2461 // number of processors
2462 unsigned nproc = Global_communicator_pt->nproc();
2463
2464 // cache my global rank
2465 unsigned my_rank = Global_communicator_pt->my_rank();
2466
2467 // send and receive requests
2468 Vector<MPI_Request> send_reqs;
2469 Vector<MPI_Request> recv_reqs;
2470
2471 // cache first_row
2472 unsigned first_row = Preconditioner_pt->first_row();
2473
2474 // local residual values for this processor
2475 double* local_r_values = local_r.values_pt();
2476
2477 // for every vector we assemble the duplicate of the vector on the
2478 // appropriate subset of processors
2479
2480 // first we post the non-blocking sends and recvs
2481 for (unsigned i = 0; i < Nprec; i++)
2482 {
2483 if (r[i].distributed())
2484 {
2485 // current first_row and nrow_local
2486 unsigned current_first_row = r[i].first_row();
2487
2488 // send and receive the contents of the vector
2489 for (unsigned p = 0; p < nproc; p++)
2490 {
2491 // use mpi methods to send to and receive from all but my rank
2492 if (p != my_rank)
2493 {
2494 // send
2495 if (Nrow_local_for_proc[i][p] != 0)
2496 {
2497 // compute the offset for the values
2498 int offset_n = First_row_for_proc[i][p] - current_first_row;
2499
2500 // send the values
2501 int tag = this->compute_tag(nproc, my_rank, p, 0);
2502 MPI_Request tr;
2503 MPI_Isend(const_cast<double*>(r[i].values_pt()) + offset_n,
2504 int(Nrow_local_for_proc[i][p]),
2505 MPI_DOUBLE,
2506 p,
2507 tag,
2508 Global_communicator_pt->mpi_comm(),
2509 &tr);
2510 send_reqs.push_back(tr);
2511 }
2512
2513 // recv
2514 if (Nrow_local_from_proc[i][p] != 0)
2515 {
2516 // compute the offset for the values
2517 int offset_n = First_row_from_proc[i][p] - first_row;
2518
2519 // receive the values
2520 int tag = this->compute_tag(nproc, p, my_rank, 0);
2521 MPI_Request tr;
2522 MPI_Irecv(local_r_values + offset_n,
2523 int(Nrow_local_from_proc[i][p]),
2524 MPI_DOUBLE,
2525 p,
2526 tag,
2527 Global_communicator_pt->mpi_comm(),
2528 &tr);
2529 recv_reqs.push_back(tr);
2530 }
2531 }
2532 }
2533 }
2534 }
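// At this point, for every distributed r[i], this processor has posted one
// non-blocking send per slice of its local part that another processor
// needs, and one non-blocking receive per slice of r[Color] that it needs
// but does not hold locally; the index tables built in
// setup_preconditioners() (First_row_for_proc, Nrow_local_for_proc, etc.)
// fully determine these slices. Non-distributed vectors are handled by the
// direct copy below.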
2535
2536
2537 // and now we send to self
2538 if (!r[Color].distributed())
2539 {
2540 // just copy to the new vector
2541 const double* r_pt = r[Color].values_pt();
2542 unsigned nrow_local = local_r.nrow_local();
2543 for (unsigned i = 0; i < nrow_local; i++)
2544 {
2545 local_r_values[i] = r_pt[i];
2546 }
2547 }
2548 else
2549 {
2550 // the incoming residual associated with the processor
2551 const double* r_pt = r[Color].values_pt();
2552
2553 // current first_row and nrow_local
2554 unsigned current_first_row = r[Color].first_row();
2555
2556 // cache first_row
2557 unsigned first_row = Preconditioner_pt->first_row();
2558
2559 // copy the slice this processor both holds and needs (send/receive to self)
2560 if (Nrow_local_from_proc[Color][my_rank] != 0)
2561 {
2562 // offset for values send to self
2563 unsigned offset_n_send =
2564 First_row_for_proc[Color][my_rank] - current_first_row;
2565
2566 // offset for values receive from self
2567 unsigned offset_n_recv =
2568 First_row_from_proc[Color][my_rank] - first_row;
2569
2570 // send/receive
2571 unsigned n_n = Nrow_local_from_proc[Color][my_rank];
2572 for (unsigned j = 0; j < n_n; j++)
2573 {
2574 local_r_values[offset_n_recv + j] = r_pt[offset_n_send + j];
2575 }
2576 }
2577 }
2578
2579 // wait for the receives to complete
2580 unsigned n_recv = recv_reqs.size();
2581 if (n_recv)
2582 {
2583 MPI_Waitall(n_recv, &recv_reqs[0], MPI_STATUS_IGNORE);
2584 }
2585 recv_reqs.clear();
2586
2587 // next solve
2588 // apply the local preconditioner
2589 DoubleVector local_z;
2590 Preconditioner_pt->preconditioner_solve(local_r, local_z);
2591 local_r.clear();
2592
2593 // the local z values
2594 double* local_z_values = local_z.values_pt();
2595
2596 // setup the vectors
2597 for (unsigned i = 0; i < Nprec; i++)
2598 {
2599 // if z[i] is not setup then set it up
2600 if (!z[i].built())
2601 {
2602 z[i].build(r[i].distribution_pt(), 0.0);
2603 }
2604 }
2605
2606 // now return the preconditioned values: post the non-blocking sends and recvs
2607 for (unsigned i = 0; i < Nprec; i++)
2608 {
2609 if (r[i].distributed())
2610 {
2611 // current first_row and nrow_local
2612 unsigned current_first_row = r[i].first_row();
2613
2614 // send and receive the contents of the vector
2615 for (unsigned p = 0; p < nproc; p++)
2616 {
2617 // use mpi methods to send to and receive from all but my rank
2618 if (p != my_rank)
2619 {
2620 // receive the preconditioned values for the rows that were sent to p
2621 if (Nrow_local_for_proc[i][p] != 0)
2622 {
2623 // compute the offset for the values
2624 int offset_n = First_row_for_proc[i][p] - current_first_row;
2625
2626 // receive the values
2627 int tag = this->compute_tag(nproc, my_rank, p, 0);
2628 MPI_Request tr;
2629 MPI_Irecv(z[i].values_pt() + offset_n,
2630 int(Nrow_local_for_proc[i][p]),
2631 MPI_DOUBLE,
2632 p,
2633 tag,
2634 Global_communicator_pt->mpi_comm(),
2635 &tr);
2636 recv_reqs.push_back(tr);
2637 }
2638
2639 // send
2640 if (Nrow_local_from_proc[i][p] != 0)
2641 {
2642 // compute the offset for the values
2643 int offset_n = First_row_from_proc[i][p] - first_row;
2644
2645 // send the locally computed values back to processor p
2646 int tag = this->compute_tag(nproc, p, my_rank, 0);
2647 MPI_Request tr;
2648 MPI_Isend(local_z_values + offset_n,
2649 int(Nrow_local_from_proc[i][p]),
2650 MPI_DOUBLE,
2651 p,
2652 tag,
2653 Global_communicator_pt->mpi_comm(),
2654 &tr);
2655 send_reqs.push_back(tr);
2656 }
2657 }
2658 }
2659 }
2660 // otherwise we need to share the results
2661 else
2662 {
2663 // number of processors associated with this preconditioner
2664 unsigned nproc_local = Local_communicator_pt->nproc();
2665
2666 // my "proc number" for this preconditioner
2667 unsigned my_local_rank = Local_communicator_pt->my_rank();
2668
2669 // sends to self completed later
2670 if (i != Color)
2671 {
2672 // post send requests
2673 for (unsigned j = my_local_rank; j < Nproc_for_prec[i];
2674 j += nproc_local)
2675 {
2676 int p = j + First_proc_for_prec[i];
2677 MPI_Request tr;
2678 MPI_Isend(local_z_values,
2679 z[Color].nrow(),
2680 MPI_DOUBLE,
2681 p,
2682 0,
2683 Global_communicator_pt->mpi_comm(),
2684 &tr);
2685 send_reqs.push_back(tr);
2686 }
2687
2688 // compute the processor number to recv from
2689 int p = my_local_rank;
2690 while ((p - int(Nproc_for_prec[i])) >= 0)
2691 {
2692 p -= Nproc_for_prec[i];
2693 }
2694 p += First_proc_for_prec[i];
2695
2696 // and recv
2697 MPI_Request tr;
2698 MPI_Irecv(z[i].values_pt(),
2699 z[i].nrow(),
2700 MPI_DOUBLE,
2701 p,
2702 0,
2703 Global_communicator_pt->mpi_comm(),
2704 &tr);
2705 recv_reqs.push_back(tr);
2706 }
2707 }
2708 }
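// Note that this exchange reverses the earlier scatter of r: for
// distributed vectors, rows recorded in the "for_proc" tables (originally
// sent to processor p) come back from p as preconditioned values, while
// rows recorded in the "from_proc" tables are returned to their owners.
// For non-distributed vectors every processor in a preconditioner's group
// holds the full solution, so each processor sends its full copy of
// z[Color] (round-robin over the local ranks) to processors in the other
// groups and receives one full copy of z[i] in return.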
2709
2710 // and now we send to self
2711 if (!r[Color].distributed())
2712 {
2713 // just copy to the new vector
2714 double* z_pt = z[Color].values_pt();
2715 unsigned nrow_local = local_z.nrow_local();
2716 for (unsigned i = 0; i < nrow_local; i++)
2717 {
2718 z_pt[i] = local_z_values[i];
2719 }
2720 }
2721 else
2722 {
2723 // the output vector associated with this processor
2724 double* z_pt = z[Color].values_pt();
2725
2726 // current first_row and nrow_local
2727 unsigned current_first_row = r[Color].first_row();
2728
2729 // cache first_row
2730 unsigned first_row = Preconditioner_pt->first_row();
2731
2732 // copy the slice computed locally back into the caller's z
2733 if (Nrow_local_from_proc[Color][my_rank] != 0)
2734 {
2735 // offset for values send to self
2736 unsigned offset_n_send =
2737 First_row_for_proc[Color][my_rank] - current_first_row;
2738
2739 // offset for values receive from self
2740 unsigned offset_n_recv =
2741 First_row_from_proc[Color][my_rank] - first_row;
2742
2743 // send/receive
2744 unsigned n_n = Nrow_local_from_proc[Color][my_rank];
2745 for (unsigned j = 0; j < n_n; j++)
2746 {
2747 z_pt[offset_n_send + j] = local_z_values[offset_n_recv + j];
2748 }
2749 }
2750 }
2751
2752
2753 // wait for the receives to complete
2754 n_recv = recv_reqs.size();
2755 if (n_recv)
2756 {
2757 MPI_Waitall(n_recv, &recv_reqs[0], MPI_STATUS_IGNORE);
2758 }
2759 recv_reqs.clear();
2760
2761 // wait for the sends to complete
2762 unsigned n_send = send_reqs.size();
2763 if (n_send)
2764 {
2765 MPI_Waitall(n_send, &send_reqs[0], MPI_STATUS_IGNORE);
2766 }
2767 send_reqs.clear();
2768 }
2769} // namespace oomph
2770
2771// End of "if we have mpi"
2772#endif