block_preconditioner.cc
Go to the documentation of this file.
1// LIC// ====================================================================
2// LIC// This file forms part of oomph-lib, the object-oriented,
3// LIC// multi-physics finite-element library, available
4// LIC// at http://www.oomph-lib.org.
5// LIC//
6// LIC// Copyright (C) 2006-2025 Matthias Heil and Andrew Hazel
7// LIC//
8// LIC// This library is free software; you can redistribute it and/or
9// LIC// modify it under the terms of the GNU Lesser General Public
10// LIC// License as published by the Free Software Foundation; either
11// LIC// version 2.1 of the License, or (at your option) any later version.
12// LIC//
13// LIC// This library is distributed in the hope that it will be useful,
14// LIC// but WITHOUT ANY WARRANTY; without even the implied warranty of
15// LIC// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16// LIC// Lesser General Public License for more details.
17// LIC//
18// LIC// You should have received a copy of the GNU Lesser General Public
19// LIC// License along with this library; if not, write to the Free Software
20// LIC// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21// LIC// 02110-1301 USA.
22// LIC//
23// LIC// The authors may be contacted at oomph-lib@maths.man.ac.uk.
24// LIC//
25// LIC//====================================================================
27
28namespace oomph
29{
30 /// Static boolean to allow block_matrix_test(...) to be run.
31 /// Defaults to false.
32 template<typename MATRIX>
34
35
36 //============================================================================
37 /// Determine the size of the matrix blocks and setup the
38 /// lookup schemes relating the global degrees of freedom with
39 /// their "blocks" and their indices (row/column numbers) in those
40 /// blocks.
41 /// The distributions of the preconditioner and the blocks are
42 /// automatically specified (and assumed to be uniform) at this
43 /// stage.
44 /// This method should be used if any block contains more than one
45 /// type of DOF. The argument vector dof_to_block_map should be of length
46 /// ndof. Each element should contain an integer indicating the block number
47 /// corresponding to that type of DOF.
48 //============================================================================
49 template<typename MATRIX>
52 {
53#ifdef PARANOID
54 // Subsidiary preconditioners don't really need the meshes
55 if (this->is_master_block_preconditioner())
56 {
57 std::ostringstream err_msg;
58 unsigned n = nmesh();
59 if (n == 0)
60 {
61 err_msg << "No meshes have been set for this block preconditioner!\n"
62 << "Set one with set_nmesh(...), set_mesh(...)" << std::endl;
63 throw OomphLibError(
65 for (unsigned m = 0; m < n; m++)
66 {
67 if (Mesh_pt[m] == 0)
68 {
69 err_msg << "The mesh pointer to mesh " << m << " is null!\n"
70 << "Set a non-null one with set_mesh(...)" << std::endl;
71 throw OomphLibError(
73 }
74 }
75 }
76 }
77#endif
78
79 // Create a copy of the vector input so that we can modify it below
81
82 if (is_subsidiary_block_preconditioner())
83 {
84#ifdef PARANOID
85 // Get the size of the Doftype_in_master_preconditioner_coarse.
87 Doftype_in_master_preconditioner_coarse.size();
88
89 // Check that the Doftype_in_master_preconditioner_coarse vector is not
90 // empty. This must be set (via the function
91 // turn_into_subsidiary_block_preconditioner) if this is a
92 // subsidiary block preconditioner.
94 {
95 std::ostringstream err_msg;
96 err_msg << "The mapping from the dof types of the master "
97 << "block preconditioner \n"
98 << "to the subsidiary block preconditioner is empty.\n"
99 << "Doftype_in_master_preconditioner_coarse.size() == 0 \n"
100 << "has turn_into_subsidiary_block_preconditioner(...)\n"
101 << "been called with the correct parameters?\n"
102 << std::endl;
103 throw OomphLibError(
105 }
106
107
108 // PARANOID checks for Doftype_coarsen_map_coarse
109 // This is also set in the function
110 // turn_into_subsidiary_block_preconditioner(...).
111 //
112 // The Doftype_coarsen_map_coarse vector must satisfy two conditions
113 // for it to be valid.
114 //
115 // 1) The dof type numbers in the dof_coarsen_map vector must be
116 // unique. For example, it does not make sense to have the vector
117 // [[0,1][1,2]] because the first inner vector says
118 // "treat dof types 0 and 1 as dof type 0" and the second inner vector
119 // says "treat dof type 1 and 2 as dof type 1", but dof type 1 is
120 // already being treated as dof type 0.
121 //
122 // 2) Every SUBSIDIARY dof type must be mapped to a dof type in the
123 // Doftype_coarsen_map_coarse vector.
124 // For example, if there are 5 dof types (passed down from the master
125 // block preconditioner), and this block subsidiary block
126 // preconditioner only deals with 3 dof types, then all 5 dof types
127 // must be mapped to a dof type in the subsidiary preconditioner. For
128 // example if the dof_map is [1,2,3,4,5], then the subsidiary block
129 // preconditioner knows that 5 dof types have been passed down. But if
130 // it only works with three dof types, we MUST have three inner vectors
131 // in the doftype_coarsen_map vector (which corresponds to dof types 0,
132 // 1 and 2), the union of the dof types in the three inner vectors must
133 // contain dof types 0, 1, 2, 3 and 4 exactly once. It cannot contain,
134 // say, 0, 1, 5, 7, 9, even though it passes the uniqueness check. We
135 // ensure this by two conditions:
136 //
137 // 2.1) The Doftype_coarsen_map_coarse vector must contain the same
138 // number of dof types as the dof_map vector.
139 // In other words, recall that Doftype_coarsen_map_coarse is a
140 // 2D vector, this must contain the same number of vectors as
141 // there are elements in the dof_to_block_map_in vector.
142 //
143 // 2.2) The maximum element in the doftype_coarsen_map_coarse vector
144 // is the length of the dof_map vector minus 1.
145
 146 // A set is ideal for checking the above three conditions, we shall insert
147 // all the elements in the doftype_coarsen_map_coarse vector into this
148 // set.
149 std::set<unsigned> doftype_map_set;
150
151 // Condition (1): Check for uniqueness by inserting all the values of
152 // Doftype_coarsen_map_coarse into a set.
154 Doftype_coarsen_map_coarse.size();
155
156 // Loop through the outer vector of Doftype_coarsen_map_coarse
157 // then loop through the inner vectors and attempt to insert each
158 // element of Doftype_coarsen_map_coarse into doftype_map_set.
159 //
160 // The inner for loop will throw an error if we cannot insert the
161 // element, this means that it is already inserted and thus not unique.
162 for (unsigned i = 0; i < para_doftype_coarsen_map_coarse_size; i++)
163 {
164 // Loop through the inner vector
166 Doftype_coarsen_map_coarse[i].size();
167 for (unsigned j = 0; j < para_doftype_coarsen_map_coarse_i_size; j++)
168 {
169 // Attempt to insert all the values of the inner vector into a set.
170 std::pair<std::set<unsigned>::iterator, bool> doftype_map_ret =
171 doftype_map_set.insert(Doftype_coarsen_map_coarse[i][j]);
172
173 if (!doftype_map_ret.second)
174 {
175 std::ostringstream err_msg;
176 err_msg << "Error: the doftype number "
177 << Doftype_coarsen_map_coarse[i][j]
178 << " is already inserted." << std::endl;
179 throw OomphLibError(
181 }
182 }
183 }
184
185 // Condition (2.1): Check that the doftype_map_set describes as many
186 // values as doftype_in_master_preconditioner_coarse. I.e. if dof_map
187 // contains 5 dof types, then the doftype_coarsen_map_coarse vector must
188 // also contain 5 dof types.
191 {
192 std::ostringstream err_msg;
193 err_msg << "The size of doftype_in_master_preconditioner_coarse "
194 << "must be the same as the total\n"
195 << "number of values in the doftype_coarsen_map_coarse vector."
196 << std::endl;
197 throw OomphLibError(
199 }
200
201 // Condition (2.2): Check that the maximum element in the
202 // doftype_coarsen_map_coarse vector is the length of the
203 // doftype_in_master_preconditioner_coarse minus 1.
207 *doftype_map_set.rbegin())
208 {
209 std::ostringstream err_msg;
210 err_msg << "The maximum dof type number in the "
211 << "doftype_coarsen_map vector must be "
213 << std::endl;
214 throw OomphLibError(
216 }
217#endif
218
219 // Set the mapping from the master preconditioner DOF types to the
220 // subsidiary preconditioner DOF types.
221 //
222 // IMPORTANT: Since DOF types may be coarsened in the master block
223 // preconditioner, this may no longer reflect the actual underlying dof
224 // types. We must get the actual underlying dof types for the
225 // block_setup(...) function to work properly so all the look up schemes
226 // for this (subsidiary) block preconditioner is correct and works
227 // properly, this is for backwards compatibility purposes and to make sure
 228 // Richard Muddle's code still works at this (subsidiary) level, although it
229 // may not be used.
230 //
231 // If we do not want to make it backwards compatible, we may as well
232 // kill the block_setup(...) for subsidiary block preconditioners -
233 // but other thing may break. Do it at your own risk (take time to
234 // fully understand the whole block preconditioning framework code).
235
236 // Create the corresponding Doftype_in_master_preconditioner_fine and
237 // Doftype_coarsen_map_fine vectors.
238
239 // First resize the vectors.
240 Doftype_in_master_preconditioner_fine.resize(0);
241 Doftype_coarsen_map_fine.resize(0);
242
243 // The Doftype_in_master_preconditioner_fine vector is easy. We know that
244 // the Doftype_coarsen_map_fine in the master preconditioner must be
245 // constructed already. So we simply loop through the values in
246 // doftype_in_master_preconditioner_coarse, then get the most fine grain
247 // dof types from the master preconditioner's Doftype_coarsen_map_fine
248 // vector.
249 //
250 // For example, if the master preconditioner has the vector:
251 // Doftype_coarsen_map_fine = [0,1,2,3][4,5,6,7][8,9,10,11][12,13][14,15]
252 //
253 // and passes the two vectors
254 // doftype_in_master_preconditioner_coarse = [1,2,3]
255 // doftype_coarsen_map_coarse = [[0][1,2]]
256 //
257 // Then we want
258 // Doftype_in_master_preconditioner_fine = [4,5,6,7,8,9,10,11,12,13]
259 //
260 // We achieve this by looking up the corresponding fine dof types in the
261 // masters' Doftype_coarsen_map_fine vector which corresponds to the
262 // values in Doftype_in_master_preconditioner_coarse.
263 //
264 // That is, the values in Doftype_in_master_preconditioner_coarse gives us
265 // the index of sub vector we want in the master's
266 // Doftype_coarsen_map_fine vector.
267
268#ifdef PARANOID
269 // Check that the master block preconditioner's Doftype_coarsen_map_fine
270 // is set up. Under the current implementation, this would always be set
271 // up properly, but we check it just in case!
272 if (master_block_preconditioner_pt()->doftype_coarsen_map_fine().size() ==
273 0)
274 {
275 std::ostringstream err_msg;
276 err_msg << "The master block preconditioner's "
277 << "Doftype_coarsen_map_fine is not\n"
278 << "set up properly.\n"
279 << "\n"
280 << "This vector is constructed in the function "
281 << "block_setup(...).\n"
282 << std::endl;
283 throw OomphLibError(
285 }
286#endif
287
289 Doftype_in_master_preconditioner_coarse.size();
291 i++)
292 {
293 // The index of the sub vector we want.
294 unsigned subvec_index = Doftype_in_master_preconditioner_coarse[i];
295
296 // Get the corresponding most fine grain sub vector from the master
297 // block preconditioner
299 Master_block_preconditioner_pt->get_fine_grain_dof_types_in(
301
302 Doftype_in_master_preconditioner_fine.insert(
303 Doftype_in_master_preconditioner_fine.end(),
304 tmp_master_dof_subvec.begin(),
306 }
307
308 // The Doftype_coarsen_map_fine vector is a bit more tricky.
309 // The Doftype_coarsen_map_coarse vector describes which coarse dof types
310 // of THIS preconditioner are grouped together. We have to translate this
311 // into the most fine grain dof types.
312 //
313 // For example, if
314 // Doftype_coarsen_map_coarse = [[0][1,2]]
315 // Doftype_in_master_preconditioner_coarse = [1,2,3]
316 //
317 // and the MASTER preconditioner has:
318 // Doftype_coarsen_map_fine= [[0,1,2,3][4,5,6,7][8,9,10,11][12,13][14,15]]
319 //
 320 // Then [[0][1,2]] tells us that the most fine grain DOF type 1 of the
 321 // master preconditioner must be grouped together, and the most fine
322 // grained dof types 2 and 3 of the master preconditioner must be grouped
323 // together.
324 //
325 // This gives the vector [[4,5,6,7] [8,9,10,11,12,13]], translating this
326 // into the local DOF types of this preconditioner we have
327 // Doftype_coarsen_map_fine = [[0,1,2,3][4,5,6,7,8,9]]. This corresponds
328 // with the Doftype_in_master_preconditioner_fine vector we created above:
329 // Doftype_in_master_preconditioner_fine = [4,5,6,7,8,9,10,11,12,13]
330 //
331 // Together, the master block preconditioner says to THIS subsidiary block
332 // preconditioner "work on my DOF types [4,5,6,7,8,9,10,11,12,13], but
333 // group your DOF type [0,1,2,3] together as DOF type 0 and [4,5,6,7,8,9]
 334 // together as DOF type 1".
335 //
336 // Think of it like this: For each DOF type in Doftype_coarsen_map_coarse
337 // we look at how many values this corresponds to in the master
338 // preconditioner. In this case, Doftype_coarsen_map_coarse:
339 //
340 // 1 - corresponds to fine DOF types 0,1,2,3 in this preconditioner,
341 // and 4,5,6,7 in the master preconditioner;
342 //
343 // 2 - corresponds to fine DOF types 4,5,6,7 in this preconditioner,
344 // and 8,9,10,11 in the master preconditioner;
345 //
346 // 3 - corresponds to fine DOF types 8,9 in this preconditioner,
347 // and 12,13 in the master preconditioner.
348 //
349 // Thus Doftype_coarsen_map_fine = [[0,1,2,3][4,5,6,7,8,9]]
350 //
351 ////////////////////////////////////////////////////////////////////////
352 //
 353 // How to do this: First we create a 2D vector which corresponds
354 // to the fine dof types in the master preconditioner but starting from
355 // 0. For example, take the above example (repeated below):
356 // Passed to this prec by the master prec:
357 // Doftype_coarsen_map_coarse = [[0][1,2]]
358 // Doftype_in_master_preconditioner_coarse = [1,2,3]
359 //
360 // and the MASTER preconditioner has:
361 // Doftype_coarsen_map_fine= [[0,1,2,3][4,5,6,7][8,9,10,11][12,13][14,15]]
362 //
363 // Step 1:
364 // Then, the temp 2D vector we want to create is:
365 // master_fine_doftype_translated = [[0 1 2 3], [4,5,6,7], [8,9]]
366 // This comes from using Doftype_in_master_preconditioner_coarse
367 // then get the number of fine dof types in the master.
368 //
369 // Step 2:
370 // Then:
371 // Loop through the vector Doftype_coarsen_map_coarse,
372 // Loop over the inner vectors in Doftype_coarsen_map_coarse
373 // Each element in the inner vector corresponds to a vector in
374 // master_fine_doftype_translated. We push in the vectors of
 375 // master_fine_doftype_translated into Doftype_coarsen_map_fine
376 //
377
379 unsigned dof_type_index = 0;
381 i++)
382 {
383 // How many fine DOF types are in the master's
384 // Doftype_in_master_preconditioner_coarse[i]?
385 unsigned coarse_dof = Doftype_in_master_preconditioner_coarse[i];
386
387 unsigned n_master_fine_doftypes =
388 Master_block_preconditioner_pt->nfine_grain_dof_types_in(coarse_dof);
389
391 for (unsigned j = 0; j < n_master_fine_doftypes; j++)
392 {
393 tmp_sub_vec.push_back(dof_type_index);
395 }
397 }
398
399
400 // master_fine_doftype_translated now contains vectors with values are
401 // from 0, 1, 2, ..,
402 //
403 // Now read out the values of master_fine_doftype_translated and place
404 // them in order according to Doftype_coarsen_map_coarse.
406 Doftype_coarsen_map_coarse.size();
407 for (unsigned i = 0; i < doftype_coarsen_map_coarse_size; i++)
408 {
411 Doftype_coarsen_map_coarse[i].size();
412 for (unsigned j = 0; j < doftype_coarsen_map_coarse_i_size; j++)
413 {
414 unsigned subvec_i = Doftype_coarsen_map_coarse[i][j];
415
416 tmp_vec.insert(tmp_vec.end(),
419 }
420
421 Doftype_coarsen_map_fine.push_back(tmp_vec);
422 }
423
424 // Get the number of block types (and DOF types) in this preconditioner
425 // from the length of the dof_map vector.
426 Internal_ndof_types = Doftype_in_master_preconditioner_fine.size();
427
428 // Nblock_types is later updated in block_setup(...)
429 Internal_nblock_types = Internal_ndof_types;
430
431 // Compute number of rows in this (sub) preconditioner using data from
432 // the master.
433 Nrow = 0;
434 for (unsigned b = 0; b < Internal_ndof_types; b++)
435 {
436 Nrow += this->internal_dof_block_dimension(b);
437 }
438
439#ifdef PARANOID
440 if (Nrow == 0)
441 {
442 std::ostringstream error_message;
443 error_message
444 << "Nrow=0 in subsidiary preconditioner. This seems fishy and\n"
445 << "suggests that block_setup() was not called for the \n"
446 << "master block preconditioner yet.";
447 throw OomphLibWarning(error_message.str(),
450 }
451#endif
452 }
453
454 // If this is a master block preconditioner, then set the
455 // Doftype_coarsen_map_fine and Doftype_coarsen_map_coarse to the
456 // identity. Recall that the Doftype_coarsen_map_fine maps the dof types
457 // that this preconditioner requires with the most fine grain dof types (the
458 // internal dof types) and the Doftype_coarsen_map_coarse maps the dof
459 // types that this preconditioner requires with the dof types which this
460 // preconditioner is given from a master preconditioner (these dof types may
461 // or may not be coarsened). In the case of the master preconditioner, these
462 // are the same (since dof types are not coarsened), furthermore the
463 // identity mapping is provided to say that dof type 0 maps to dof type 0,
464 // dof type 1 maps to dof type 1,
465 // dof type 2 maps to dof type 2,
466 // etc...
467 //
468 // If this is not a master block preconditioner, then the vectors
469 // Doftype_coarsen_map_fine and Doftype_coarsen_map_coarse is handled
470 // by the turn_into_subsidiary_block_preconditioner(...) function.
471 if (is_master_block_preconditioner())
472 {
473 // How many dof types does this preconditioner work with?
475
476 // Note: at the master level, the n_external_dof_types should be the same
477 // as the internal_ndof_types(), since the dof_to_block_map MUST describe
478 // the mapping between every dof type (not yet coarsened - so it is the
479 // same number as the internal dof types) to the block types. But we
480 // distinguish them for clarity. We also check that this is the case.
481#ifdef PARANOID
482 unsigned n_internal_dof_types = internal_ndof_types();
483
485 {
486 std::ostringstream err_msg;
487 err_msg
488 << "The internal ndof types and the length of the dof_to_block_map\n"
489 << "vector is not the same. Since this is the master block "
490 << "preconditioner,\n"
491 << "you must describe which block each DOF type belongs to,\n"
492 << "no more, no less."
493 << "internal_ndof_types = " << n_internal_dof_types << "\n"
494 << "dof_to_block_map.size() = " << n_external_dof_types << "\n";
495 throw OomphLibWarning(
497 }
498#endif
499
500 // Clear and reserve space.
501 Doftype_coarsen_map_fine.clear();
502 Doftype_coarsen_map_coarse.clear();
503 Doftype_coarsen_map_fine.reserve(n_external_dof_types);
504 Doftype_coarsen_map_coarse.reserve(n_external_dof_types);
505
506 // Now push back the identity mapping.
507 for (unsigned i = 0; i < n_external_dof_types; i++)
508 {
509 // Create a vector and push it in.
511 Doftype_coarsen_map_fine.push_back(tmp_vec);
512 Doftype_coarsen_map_coarse.push_back(tmp_vec);
513 }
514 }
515 else
516 // Else this is a subsidiary block preconditioner.
517 {
518 // Both the Doftype_coarsen_map_fine and Doftype_coarsen_map_coarse
519 // vectors must be already be handled by the
520 // turn_into_subsidiary_block_preconditioner(...) function. We check this.
521#ifdef PARANOID
522 if ((Doftype_coarsen_map_fine.size() == 0) ||
523 (Doftype_coarsen_map_coarse.size() == 0))
524 {
525 std::ostringstream err_msg;
526 err_msg << "Either the Doftype_coarsen_map_fine or the \n"
527 << "Doftype_coarsen_map_coarse vectors is of size 0.\n"
528 << "Did you remember to call the function "
529 << "turn_into_subsidiary_block_preconditioner(...)?";
530 throw OomphLibWarning(
532 }
533#endif
534 }
535
536
537 // Now we create the vector Block_to_dof_map_coarse.
538 // Recall that the vector describe which dof types are in which block with
539 // the relationship:
540 //
541 // Block_to_dof_map_coarse[block_number] = Vector[dof types];
542 //
543 // Note that this is not the internal (underlying) dof type.
544 // Nor is this in relation to the parent block preconditioner's dof type.
545 // The number of elements in it is the same as dof_to_block_map vector.
546 //
547 // Since the dof type coarsening feature is added later, we encapsulate this
548 // bit of the code so it does not affect things below.
549 {
550 // Check that the dof_to_block map "makes sense" for the
551 // Doftype_coarsen_map_coarse.
552 // The Doftype_coarsen_map_coarse describes which doftypes should be
553 // considered as a single doftype in THIS preconditioner.
554 //
555 // For example, if this preconditioner is the LSC block preconditioner
556 // applied to a 3D problem, it expects 4 doftypes:
557 // 3 velocity, [u, v, w] and 1 pressure [p],
558 // giving us the dof type ordering
559 // [u v w p].
560 //
561 // The LSC preconditioner groups the velocity and pressure doftypes
562 // separately, thus the dof_to_block_map will be:
563 // [0 0 0 1]
564 //
565 // Then the Doftype_coarsen_map_coarse MUST have length 4, to identify
566 // which of the OTHER (possibly coarsened) dof types should be grouped
567 // together to be considered as a single dof types of THIS preconditioner.
568 //
569 // For example, if the preconditioner above this one has the dof type
570 // ordering:
571 // 0 1 2 3 4 5 6 7 8 9
572 // ub vb wb up vp wp ut vt wt p
573 // Then we want to tell THIS preconditioner which dof types belongs to
574 // u, v, w and p, by providing the optional argument
575 // Doftype_coarsen_map_coarse to the
576 // turn_into_subsidiary_block_preconditioner(...) function
577 // [[0 3 6] [1 4 7] [2 5 8] [9]]
578 //
579 // So, it is important that the length of dof_to_block_map is the same as
580 // the length of Doftype_coarsen_map_coarse. We check this.
582
583#ifdef PARANOID
584 if (dof_to_block_map_size != Doftype_coarsen_map_coarse.size())
585 {
586 std::ostringstream err_msg;
587 err_msg
588 << "The size of dof_to_block_map and Doftype_coarsen_map_coarse is "
589 "not "
590 << "the same.\n"
591 << "dof_to_block_map.size() = " << dof_to_block_map_size << "\n"
592 << "Doftype_coarsen_map_coarse.size() = "
593 << Doftype_coarsen_map_coarse.size() << ".\n"
594 << "One of the two list is incorrect, please look at the comments\n"
595 << "in the source code for more details.";
596 throw OomphLibWarning(
598 }
599#endif
600
601 // Create the Block_to_dof_map_coarse from
602 // the dof_to_block_map and Doftype_coarsen_map_coarse.
603
604 // find the maximum block number
605 unsigned max_block_number =
606 *std::max_element(dof_to_block_map.begin(), dof_to_block_map.end());
607
608 // Now we do the following:
609 // Lets say the Doftype_coarsen_map_coarse is:
610 // [0 3 6]
611 // [1 4 7]
612 // [2 5 8]
613 // [9]
614 //
615 // (this is the same as the above example)
616 //
617 // and the dof_to_block_map is [0 0 0 1].
618 //
619 // Then we need to form the Block_to_dof_map_coarse:
620 // [0 3 6 1 4 7 2 5 8]
621 // [9]
622
623 // Clear anything in the Block_to_dof_map_coarse
624 Block_to_dof_map_coarse.clear();
625
626 const unsigned tmp_nblock = max_block_number + 1;
627
628 Block_to_dof_map_coarse.resize(tmp_nblock);
629
630 for (unsigned i = 0; i < dof_to_block_map_size; i++)
631 {
632 Block_to_dof_map_coarse[dof_to_block_map[i]].push_back(i);
633 }
635 Block_to_dof_map_fine.clear();
636 Block_to_dof_map_fine.resize(tmp_nblock);
637 for (unsigned block_i = 0; block_i < tmp_nblock; block_i++)
638 {
639 // get the dof types in this block.
640 const unsigned ndof_in_block = Block_to_dof_map_coarse[block_i].size();
641 for (unsigned dof_i = 0; dof_i < ndof_in_block; dof_i++)
642 {
643 const unsigned coarsened_dof_i =
644 Block_to_dof_map_coarse[block_i][dof_i];
645
646 // Insert the most fine grain dofs which this dof_i corresponds to
647 // into block_i
649 Doftype_coarsen_map_fine[coarsened_dof_i];
650
651 Block_to_dof_map_fine[block_i].insert(
652 Block_to_dof_map_fine[block_i].end(),
653 dof_i_dofs.begin(),
654 dof_i_dofs.end());
655 }
656 }
657
 658 // Now set the dof_to_block_map to the identity.
659 // NOTE: We are now using the internal n dof types. This is because the
660 // dof type coarsening feature was built on top of the existing block
661 // preconditioning framework which does not handle coarsening of dof
662 // types. Hence, under the hood, it still works with the most fine grain
663 // dof types and does not do any coarsening.
664
665 // Locally cache the internal ndof types (using access function because
666 // the Internal_ndof_type variable may not be set up yet if this is a
667 // master preconditioner).
668 unsigned tmp_internal_ndof_types = internal_ndof_types();
669
671
672 for (unsigned i = 0; i < tmp_internal_ndof_types; i++)
673 {
675 }
676 } // end of Block_to_dof_map_coarse encapsulation
677
678#ifdef PARANOID
679
680 // Check that the meshes are ok. This only needs to be done in the master
681 // because subsidiary preconditioners don't do anything with the meshes
682 // here.
683 if (is_master_block_preconditioner())
684 {
685 // This is declared as local_nmesh because there are other variables
686 // called nmesh floating about... but this will not exist if PARANOID is
687 // switched on.
688 unsigned local_nmesh = nmesh();
689
690 // Check that some mesh pointers have been assigned.
691 if (local_nmesh == 0)
692 {
693 std::ostringstream error_msg;
694 error_msg << "Cannot setup blocks because no meshes have been set.";
695 throw OomphLibError(
697 }
698
699 // Each mesh must contain elements with the same number of dof.
700 // A stricter check is to ensure that the mesh contains only one type of
 701 // elements. This is relaxed in some cases.
702 for (unsigned mesh_i = 0; mesh_i < local_nmesh; mesh_i++)
703 {
704 // The number of elements in the current mesh.
705 unsigned n_element = mesh_pt(mesh_i)->nelement();
706
707 // When the bulk mesh is distributed, there may not be any elements
708 // in the surface mesh(es).
709 if (n_element > 0)
710 {
711 // The string of the first element in the current mesh.
712 std::string first_element_string =
713 typeid(*(mesh_pt(mesh_i)->element_pt(0))).name();
714
715 // If there are multiple element types in the current mesh,
716 // we can at least make sure that they contain the same types of DOFs.
717 if (bool(Allow_multiple_element_type_in_mesh[mesh_i]))
718 {
719 // The ndof types of the first element.
720 unsigned first_element_ndof_type =
721 mesh_pt(mesh_i)->element_pt(0)->ndof_types();
722
723 // Loop through the meshes and compare the number of types of DOFs.
724 for (unsigned el_i = 1; el_i < n_element; el_i++)
725 {
726 // The ndof type of the current element.
728 mesh_pt(mesh_i)->element_pt(el_i)->ndof_types();
729
730 // The string of the current element.
731 std::string current_element_string =
732 typeid(*(mesh_pt(mesh_i)->element_pt(el_i))).name();
733
734 // Compare against the first element.
736 {
737 std::ostringstream error_message;
738 error_message
739 << "Elements in the same mesh MUST have the same number of "
740 "types "
741 << "of DOFs.\n"
742 << "The element in mesh " << mesh_i << ", at position "
743 << el_i << " is: \n"
744 << current_element_string << ", \n"
745 << "with ndof types: " << current_element_ndof_type << ".\n"
746 << "The first element in the same mesh is: \n"
747 << first_element_string << ", \n"
748 << "with ndof types: " << first_element_ndof_type << ".\n";
749 throw OomphLibError(error_message.str(),
752 }
753 }
754 }
755 else
756 // There should be only one type of elements in the current mesh.
757 // Check that this is the case!
758 {
759 // Loop through the elements in the current mesh.
760 for (unsigned el_i = 1; el_i < n_element; el_i++)
761 {
762 // The string of the current element.
763 std::string current_element_string =
764 typeid(*(mesh_pt(mesh_i)->element_pt(el_i))).name();
765
766 // Compare against the first element.
768 {
769 std::ostringstream error_message;
770 error_message
771 << "By default, a mesh containing block preconditionable "
772 << "elements must contain only one type of element.\n"
773 << "The element in mesh " << mesh_i << ", at position "
774 << el_i << " is: \n"
775 << current_element_string << "\n"
776 << "The first element in the same mesh is: \n"
777 << first_element_string << "\n"
778 << "If this is correct, consider calling the set_mesh(...) "
779 "with\n"
780 << "the optional argument set true to allow multiple "
781 "element\n"
782 << "types in the same mesh.\n"
783 << "Note: A minimal requirement is that the elements in the "
784 "same\n"
785 << "mesh MUST have the same number of DOF types.";
786 throw OomphLibError(error_message.str(),
789 }
790 }
791 }
792 }
793 }
794 }
795
796#endif
797 // clear the memory
798 this->clear_block_preconditioner_base();
799
800 // get my_rank and nproc
801#ifdef OOMPH_HAS_MPI
802 unsigned my_rank = comm_pt()->my_rank();
803 unsigned nproc = comm_pt()->nproc();
804#endif
805
806
807 /////////////////////////////////////////////////////////////////////////////
808 // start of master block preconditioner only operations
809 /////////////////////////////////////////////////////////////////////////////
810#ifdef OOMPH_HAS_MPI
811 unsigned* nreq_sparse = new unsigned[nproc]();
812 unsigned* nreq_sparse_for_proc = new unsigned[nproc]();
813 unsigned** index_in_dof_block_sparse_send = new unsigned*[nproc]();
814 unsigned** dof_number_sparse_send = new unsigned*[nproc]();
817#endif
818
819 // If this preconditioner is the master preconditioner then we need
820 // to assemble the vectors : Dof_number
821 // Index_in_dof_block
822 if (is_master_block_preconditioner())
823 {
824 // Get the number of dof types in each mesh.
825 Ndof_types_in_mesh.assign(nmesh(), 0);
826 for (unsigned i = 0; i < nmesh(); i++)
827 {
828 Ndof_types_in_mesh[i] = mesh_pt(i)->ndof_types();
829 }
830 // Setup the distribution of this preconditioner, assumed to be the same
831 // as the matrix if the matrix is distributable.
832 if (dynamic_cast<DistributableLinearAlgebraObject*>(matrix_pt()))
833 {
834 this->build_distribution(
835 dynamic_cast<DistributableLinearAlgebraObject*>(matrix_pt())
836 ->distribution_pt());
837 }
838 else
839 {
840 LinearAlgebraDistribution dist(comm_pt(), matrix_pt()->nrow(), false);
841 this->build_distribution(dist);
842 }
843 Nrow = matrix_pt()->nrow();
844
845 // Boolean to indicate whether the matrix is actually distributed,
846 // ie distributed and on more than one processor.
847 bool matrix_distributed =
848 (this->distribution_pt()->distributed() &&
849 this->distribution_pt()->communicator_pt()->nproc() > 1);
850
851
852 // Matrix must be a CR matrix.
853 CRDoubleMatrix* cr_matrix_pt = dynamic_cast<CRDoubleMatrix*>(matrix_pt());
854
855 if (cr_matrix_pt == 0)
856 {
857 std::ostringstream error_message;
858 error_message << "Block setup for distributed matrices only works "
859 << "for CRDoubleMatrices";
860 throw OomphLibError(error_message.str(),
863 }
864
865
866 // Get distribution.
867 unsigned first_row = this->distribution_pt()->first_row();
868 unsigned nrow_local = this->distribution_pt()->nrow_local();
869 unsigned last_row = first_row + nrow_local - 1;
870
871#ifdef OOMPH_HAS_MPI
872 // storage for the rows required by each processor in the dense
873 // block lookup storage scheme
874 // dense_required_rows(p,0) is the minimum global index required by proc p
875 // ...(p,1) is the maximum global index required by proc p
876 DenseMatrix<unsigned> dense_required_rows(nproc, 2);
877 for (unsigned p = 0; p < nproc; p++)
878 {
879 dense_required_rows(p, 0) = this->distribution_pt()->first_row(p);
880 dense_required_rows(p, 1) = this->distribution_pt()->first_row(p) +
881 this->distribution_pt()->nrow_local(p) - 1;
882 }
883
 884 // determine the global rows that are not in the range first_row to
885 // first_row+nrow_local for which we should store the
886 // Dof_index and Index_in_dof_block for
887 // then send the lists to other processors
888 std::set<unsigned> sparse_global_rows_for_block_lookup;
890 {
891 unsigned nnz = cr_matrix_pt->nnz();
892 int* column_index = cr_matrix_pt->column_index();
893 for (unsigned i = 0; i < nnz; i++)
894 {
895 unsigned ci = column_index[i];
897 {
899 }
900 }
901 }
902
904
905 Global_index_sparse.resize(0);
906 std::copy(sparse_global_rows_for_block_lookup.begin(),
908 std::back_inserter(Global_index_sparse));
909
910 Index_in_dof_block_sparse.resize(nsparse);
911 Dof_number_sparse.resize(nsparse);
913
916 {
919
920 int zero = 0;
921 for (unsigned p = 0; p < nproc; p++)
922 {
923 // determine the global eqn numbers required by this processor
924 // that can be classified by processor p
925 int begin = 0;
926 for (int i = 0; i < nsparse; ++i)
927 {
928 if (Global_index_sparse[i] < dense_required_rows(p, 0))
929 {
930 ++begin;
931 }
932 else
933 {
934 if (Global_index_sparse[i] <= dense_required_rows(p, 1))
935 {
936 ++nreq_sparse[p];
937 }
938 else
939 {
940 break;
941 }
942 }
943 }
944
945 // if this processor has rows to be classified by proc p
946 if (nreq_sparse[p] > 0)
947 {
948 // send the number of global eqn numbers
951 1,
953 p,
954 31,
955 comm_pt()->mpi_comm(),
956 &req1);
957 send_requests_sparse.push_back(req1);
958
959 // send the global eqn numbers
961 MPI_Isend(&Global_index_sparse[begin],
962 nreq_sparse[p],
964 p,
965 32,
966 comm_pt()->mpi_comm(),
967 &req2);
968 send_requests_sparse.push_back(req2);
969
970 // post the recvs for the data that will be returned
971
972 // the datatypes, displacements, lengths for the two datatypes
975 int lengths[2];
976
977 // index in dof block
980 MPI_Get_address(&Index_in_dof_block_sparse[begin],
981 &displacements[0]);
983 lengths[0] = 1;
984
985 // dof number
988 MPI_Get_address(&Dof_number_sparse[begin], &displacements[1]);
990 lengths[1] = 1;
991
992 // build the final type
997 MPI_Type_free(&types[0]);
998 MPI_Type_free(&types[1]);
999
1000 // and recv
1002 MPI_Irecv(
1003 nreq_sparse, 1, recv_type, p, 33, comm_pt()->mpi_comm(), &req);
1004 recv_requests_sparse.push_back(req);
1006 }
1007
1008 // if no communication required, confirm this
1009 if (nreq_sparse[p] == 0)
1010 {
1012 MPI_Isend(&zero, 1, MPI_INT, p, 31, comm_pt()->mpi_comm(), &req1);
1013 send_requests_sparse.push_back(req1);
1014 }
1015
1016 //
1019 1,
1021 p,
1022 31,
1023 comm_pt()->mpi_comm(),
1024 &req);
1025 recv_requests_sparse_nreq.push_back(req);
1026 }
1027 }
1028#endif
1029
1030 // resize the storage
1031 Dof_number_dense.resize(nrow_local);
1032 Index_in_dof_block_dense.resize(nrow_local);
1033
1034 // zero the number of dof types
1035 Internal_ndof_types = 0;
1036
1037#ifdef PARANOID
1038 // Vector to keep track of previously assigned block numbers
1039 // to check consistency between multiple assignments.
1042#endif
1043
 1044 // determine whether the problem is distributed
1045 bool problem_distributed = false;
1046
1047 // the problem method distributed() is only accessible with MPI
1048#ifdef OOMPH_HAS_MPI
1049 problem_distributed = any_mesh_distributed();
1050#endif
1051
1052 // if the problem is not distributed
1054 {
1055 // Offset for the block type in the overall system.
1056 // Different meshes contain different block-preconditionable
1057 // elements -- their blocks are added one after the other.
1058 unsigned dof_offset = 0;
1059
1060 // Loop over all meshes.
1061 for (unsigned m = 0; m < nmesh(); m++)
1062 {
1063 // Number of elements in this mesh.
1064 unsigned n_element = mesh_pt(m)->nelement();
1065
1066 // Find the number of block types that the elements in this mesh
1067 // are in charge of.
1068 unsigned ndof_in_element = ndof_types_in_mesh(m);
1069 Internal_ndof_types += ndof_in_element;
1070
1071 for (unsigned e = 0; e < n_element; e++)
1072 {
1073 // List containing pairs of global equation number and
1074 // dof number for each global dof in an element.
1075 std::list<std::pair<unsigned long, unsigned>> dof_lookup_list;
1076
1077 // Get list of blocks associated with the element's global unknowns.
1078 mesh_pt(m)->element_pt(e)->get_dof_numbers_for_unknowns(
1080
1081 // Loop over all entries in the list
1082 // and store the block number.
1083 typedef std::list<std::pair<unsigned long, unsigned>>::iterator IT;
1084 for (IT it = dof_lookup_list.begin(); it != dof_lookup_list.end();
1085 it++)
1086 {
1087 unsigned long global_dof = it->first;
1088 if (global_dof >= unsigned(first_row) &&
1090 {
1091 unsigned dof_number = (it->second) + dof_offset;
1092 Dof_number_dense[global_dof - first_row] = dof_number;
1093
1094#ifdef PARANOID
1095 // Check consistency of block numbers if assigned multiple times
1097 0)
1098 {
1100 dof_number;
1101 }
1102#endif
1103 }
1104 }
1105 }
1106
1107 // About to do the next mesh which contains block preconditionable
1108 // elements of a different type; all the dofs that these elements are
1109 // "in charge of" differ from the ones considered so far.
1110 // Bump up the block counter to make sure we're not overwriting
1111 // anything here
1113 }
1114
1115#ifdef PARANOID
1116 // check that every global equation number has been allocated a dof type
1117 for (unsigned i = 0; i < nrow_local; i++)
1118 {
1120 {
1121 std::ostringstream error_message;
1122 error_message << "Not all degrees of freedom have had DOF type "
1123 << "numbers allocated. Dof number " << i
1124 << " is unallocated.";
1125 throw OomphLibError(error_message.str(),
1128 }
1129 }
1130#endif
1131 }
1132 // else the problem is distributed
1133 else
1134 {
1135#ifdef OOMPH_HAS_MPI
1136 // Offset for the block type in the overall system.
1137 // Different meshes contain different block-preconditionable
1138 // elements -- their blocks are added one after the other...
1139 unsigned dof_offset = 0;
1140
1141 // the set of global degrees of freedom and their corresponding dof
1142 // number on this processor
1143 std::map<unsigned long, unsigned> my_dof_map;
1144
1145 // Loop over all meshes
1146 for (unsigned m = 0; m < nmesh(); m++)
1147 {
1148 // Number of elements in this mesh
1149 unsigned n_element = this->mesh_pt(m)->nelement();
1150
1151 // Find the number of block types that the elements in this mesh
1152 // are in charge of
1153 unsigned ndof_in_element = ndof_types_in_mesh(m);
1154 Internal_ndof_types += ndof_in_element;
1155
1156 // Loop over all elements
1157 for (unsigned e = 0; e < n_element; e++)
1158 {
1159 // if the element is not a halo element
1160 if (!this->mesh_pt(m)->element_pt(e)->is_halo())
1161 {
1162 // List containing pairs of global equation number and
1163 // dof number for each global dof in an element
1164 std::list<std::pair<unsigned long, unsigned>> dof_lookup_list;
1165
1166 // Get list of blocks associated with the element's global
1167 // unknowns
1168 this->mesh_pt(m)->element_pt(e)->get_dof_numbers_for_unknowns(
1170
1171 // update the block numbers and put it in the map.
1172 typedef std::list<std::pair<unsigned long, unsigned>>::iterator
1173 IT;
1174 for (IT it = dof_lookup_list.begin(); it != dof_lookup_list.end();
1175 it++)
1176 {
1177 it->second = (it->second) + dof_offset;
1178 my_dof_map[it->first] = it->second;
1179 }
1180 }
1181 }
1182
1183 // About to do the next mesh which contains block preconditionable
1184 // elements of a different type; all the dofs that these elements are
1185 // "in charge of" differ from the ones considered so far.
1186 // Bump up the block counter to make sure we're not overwriting
1187 // anything here
1189 }
1190
1191 // next copy the map of my dofs to two vectors to send
1192 unsigned my_ndof = my_dof_map.size();
1193 unsigned long* my_global_dofs = new unsigned long[my_ndof];
1194 unsigned* my_dof_numbers = new unsigned[my_ndof];
1195 typedef std::map<unsigned long, unsigned>::iterator IT;
1196 unsigned pt = 0;
1197 for (IT it = my_dof_map.begin(); it != my_dof_map.end(); it++)
1198 {
1199 my_global_dofs[pt] = it->first;
1200 my_dof_numbers[pt] = it->second;
1201 pt++;
1202 }
1203
1204 // and then clear the map
1205 my_dof_map.clear();
1206
1207 // count up how many DOFs need to be sent to each processor
1208 int* first_dof_to_send = new int[nproc];
1209 int* ndof_to_send = new int[nproc];
1210 unsigned ptr = 0;
1211 for (unsigned p = 0; p < nproc; p++)
1212 {
1213 first_dof_to_send[p] = 0;
1214 ndof_to_send[p] = 0;
1215 while (ptr < my_ndof &&
1217 {
1218 ptr++;
1219 }
1221 while (ptr < my_ndof &&
1223 {
1224 ndof_to_send[p]++;
1225 ptr++;
1226 }
1227 }
1228
1229 // next communicate to each processor how many dofs it expects to recv
1230 int* ndof_to_recv = new int[nproc];
1232 1,
1233 MPI_INT,
1235 1,
1236 MPI_INT,
1237 comm_pt()->mpi_comm());
1238
1239 // the base displacements for the sends
1242
1243#ifdef PARANOID
 1244 // storage for paranoid check to ensure that every row has been
1245 // imported
1246 std::vector<bool> dof_recv(nrow_local, false);
1247#endif
1248
1249 // next send and recv
1254 Vector<unsigned> proc;
1255 for (unsigned p = 0; p < nproc; p++)
1256 {
1257 if (p != my_rank)
1258 {
1259 // send
1260 if (ndof_to_send[p] > 0)
1261 {
1262 // the datatypes, displacements, lengths for the two datatypes
1265 int lengths[2];
1266
1267 // my global dofs
1272 &displacements[0]);
1274 lengths[0] = 1;
1275
1276 // my dof numbers
1280 &displacements[1]);
1282 lengths[1] = 1;
1283
1284 // build the final type
1289 MPI_Type_free(&types[0]);
1290 MPI_Type_free(&types[1]);
1291
1292 // and send
1295 1,
1296 send_type,
1297 p,
1298 2,
1299 comm_pt()->mpi_comm(),
1300 &req);
1301 send_requests.push_back(req);
1303 }
1304
1305 // and recv
1306 if (ndof_to_recv[p] > 0)
1307 {
1308 // resize the storage
1309 global_dofs_recv[p] = new unsigned long[ndof_to_recv[p]];
1310 dof_numbers_recv[p] = new unsigned[ndof_to_recv[p]];
1311 proc.push_back(p);
1312
1313 // the datatypes, displacements, lengths for the two datatypes
1316 int lengths[2];
1317
1318 // my global dofs
1324 lengths[0] = 1;
1325
1326 // my dof numbers
1331 lengths[1] = 1;
1332
1333 // build the final type
1338 MPI_Type_free(&types[0]);
1339 MPI_Type_free(&types[1]);
1340
1341 // and recv
1344 1,
1345 recv_type,
1346 p,
1347 2,
1348 comm_pt()->mpi_comm(),
1349 &req);
1350 recv_requests.push_back(req);
1352 }
1353 }
1354 // send to self
1355 else
1356 {
1357 unsigned u = first_dof_to_send[p] + ndof_to_recv[p];
1358 for (unsigned i = first_dof_to_send[p]; i < u; i++)
1359 {
1360#ifdef PARANOID
 1361 // indicate that this dof has been received
1362 dof_recv[my_global_dofs[i] - first_row] = true;
1363#endif
1364 Dof_number_dense[my_global_dofs[i] - first_row] =
1366 }
1367 }
1368 }
1369
1370 // recv and store the data
1371 unsigned c_recv = recv_requests.size();
1372 while (c_recv > 0)
1373 {
1374 // wait for any communication to finish
1375 int req_number;
1378 recv_requests.erase(recv_requests.begin() + req_number);
1379 c_recv--;
1380
1381 // determine the source processor
1382 unsigned p = proc[req_number];
1383 proc.erase(proc.begin() + req_number);
1384
1385 // import the data
1386 for (int i = 0; i < ndof_to_recv[p]; i++)
1387 {
1388#ifdef PARANOID
 1389 // indicate that this dof has been received
1390 dof_recv[global_dofs_recv[p][i] - first_row] = true;
1391#endif
1392 Dof_number_dense[global_dofs_recv[p][i] - first_row] =
1394 }
1395
1396 // delete the data
1397 delete[] global_dofs_recv[p];
1398 delete[] dof_numbers_recv[p];
1399 }
1400
1401 // finally wait for the send requests to complete as we are leaving
1402 // an MPI block of code
1403 unsigned csr = send_requests.size();
1404 if (csr)
1405 {
1407 }
1408
1409 // clean up
1410 delete[] ndof_to_send;
1411 delete[] first_dof_to_send;
1412 delete[] ndof_to_recv;
1413 delete[] my_global_dofs;
1414 delete[] my_dof_numbers;
1415#ifdef PARANOID
1416 unsigned all_recv = true;
1417 for (unsigned i = 0; i < nrow_local; i++)
1418 {
1419 if (!dof_recv[i])
1420 {
1421 all_recv = false;
1422 }
1423 }
1424 if (!all_recv)
1425 {
1426 std::ostringstream error_message;
1427 error_message << "Not all the DOF numbers required were received";
1428 throw OomphLibError(error_message.str(),
1431 }
1432#endif
1433#else
1434 std::ostringstream error_message;
1435 error_message
1436 << "The problem appears to be distributed, MPI is required";
1437 throw OomphLibError(error_message.str(),
1440#endif
1441 }
1442#ifdef OOMPH_HAS_MPI
1446 {
1447 // wait for number of sparse rows each processor requires
1448 // post recvs for that data
1449 if (recv_requests_sparse_nreq.size() > 0)
1450 {
1454 }
1455 for (unsigned p = 0; p < nproc; ++p)
1456 {
1457 if (nreq_sparse_for_proc[p] > 0)
1458 {
1460 sparse_rows_for_proc[p] = new unsigned[nreq_sparse_for_proc[p]];
1464 p,
1465 32,
1466 comm_pt()->mpi_comm(),
1467 &req);
1469 }
1470 }
1471 }
1472#endif
1473
1474
1475 // for every global degree of freedom required by this processor we now
1476 // have the corresponding dof number
1477
1478 // clear the Ndof_in_dof_block storage
1479 Dof_dimension.assign(Internal_ndof_types, 0);
1480
1481 // first consider a non distributed matrix
1482 if (!matrix_distributed)
1483 {
1484 // set the Index_in_dof_block
1485 unsigned nrow = this->distribution_pt()->nrow();
1486 Index_in_dof_block_dense.resize(nrow);
1487 Index_in_dof_block_dense.initialise(0);
1488 for (unsigned i = 0; i < nrow; i++)
1489 {
1490 Index_in_dof_block_dense[i] = Dof_dimension[Dof_number_dense[i]];
1491 Dof_dimension[Dof_number_dense[i]]++;
1492 }
1493 }
1494
1495 // next a distributed matrix
1496 else
1497 {
1498#ifdef OOMPH_HAS_MPI
1499
1500
1501 // first compute how many instances of each dof are on this
1502 // processor
1503 unsigned* my_nrows_in_dof_block = new unsigned[Internal_ndof_types];
1504 for (unsigned i = 0; i < Internal_ndof_types; i++)
1505 {
1507 }
1508 for (unsigned i = 0; i < nrow_local; i++)
1509 {
1510 my_nrows_in_dof_block[Dof_number_dense[i]]++;
1511 }
1512
1513 // next share the data
1514 unsigned* nrow_in_dof_block_recv =
1515 new unsigned[Internal_ndof_types * nproc];
1517 Internal_ndof_types,
1520 Internal_ndof_types,
1522 comm_pt()->mpi_comm());
1523 delete[] my_nrows_in_dof_block;
1524
1525 // compute my first dof index and Nrows_in_dof_block
1526 Vector<unsigned> my_first_dof_index(Internal_ndof_types, 0);
1527 for (unsigned i = 0; i < Internal_ndof_types; i++)
1528 {
1529 for (unsigned p = 0; p < my_rank; p++)
1530 {
1532 nrow_in_dof_block_recv[p * Internal_ndof_types + i];
1533 }
1534 Dof_dimension[i] = my_first_dof_index[i];
1535 for (unsigned p = my_rank; p < nproc; p++)
1536 {
1537 Dof_dimension[i] +=
1538 nrow_in_dof_block_recv[p * Internal_ndof_types + i];
1539 }
1540 }
1541 delete[] nrow_in_dof_block_recv;
1542
1543 // next compute Index in dof block
1544 Index_in_dof_block_dense.resize(nrow_local);
1545 Index_in_dof_block_dense.initialise(0);
1546 Vector<unsigned> dof_counter(Internal_ndof_types, 0);
1547 for (unsigned i = 0; i < nrow_local; i++)
1548 {
1549 Index_in_dof_block_dense[i] =
1550 my_first_dof_index[Dof_number_dense[i]] +
1551 dof_counter[Dof_number_dense[i]];
1552 dof_counter[Dof_number_dense[i]]++;
1553 }
1554
1555 // the base displacements for the sends
1561 }
1564 unsigned first_row = this->distribution_pt()->first_row();
1565 for (unsigned p = 0; p < nproc; ++p)
1566 {
1567 if (nreq_sparse_for_proc[p] > 0)
1568 {
1569 // construct the data
1571 new unsigned[nreq_sparse_for_proc[p]];
1573 for (unsigned i = 0; i < nreq_sparse_for_proc[p]; ++i)
1574 {
1575 unsigned r = sparse_rows_for_proc[p][i];
1576 r -= first_row;
1578 Index_in_dof_block_dense[r];
1579 dof_number_sparse_send[p][i] = Dof_number_dense[r];
1580 }
1581 delete[] sparse_rows_for_proc[p];
1582
1583 // send the data
1584 // the datatypes, displacements, lengths for the two datatypes
1587 int lengths[2];
1588
1589 // index in dof block
1594 &displacements[0]);
1596 lengths[0] = 1;
1597
1598 // dof number
1604 lengths[1] = 1;
1605
1606 // build the final type
1611 MPI_Type_free(&types[0]);
1613
1614 // and recv
1617 1,
1618 send_type,
1619 p,
1620 33,
1621 comm_pt()->mpi_comm(),
1623 send_requests_sparse.push_back(req);
1625 }
1626 else
1627 {
1630 }
1631 }
1632#endif
1633 }
1634 }
1635
1636 /////////////////////////////////////////////////////////////////////////////
1637 // end of master block preconditioner only operations
1638 /////////////////////////////////////////////////////////////////////////////
1639
1640 // compute the number of rows in each block
1641
1642#ifdef PARANOID
1643 // check the vector is the correct length
1644 if (dof_to_block_map.size() != Internal_ndof_types)
1645 {
1646 std::ostringstream error_message;
1647 error_message << "The dof_to_block_map vector (size="
1649 << ") must be of size Internal_ndof_types="
1650 << Internal_ndof_types;
1652 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
1653 }
1654#endif
1655
1656 // find the maximum block number RAYAY use std::max_element
1657 unsigned max_block_number = 0;
1658 for (unsigned i = 0; i < Internal_ndof_types; i++)
1659 {
1661 {
1663 }
1664 }
1665
 1666 // resize the storage for the block to dof map
1667 Block_number_to_dof_number_lookup.clear();
1668 Block_number_to_dof_number_lookup.resize(max_block_number + 1);
1669 Ndof_in_block.clear();
1670 Ndof_in_block.resize(max_block_number + 1);
1671
1672 // resize storage
1673 Dof_number_to_block_number_lookup.resize(Internal_ndof_types);
1674
1675 // build the storage for the two maps (block to dof) and (dof to block)
1676 for (unsigned i = 0; i < Internal_ndof_types; i++)
1677 {
1678 Dof_number_to_block_number_lookup[i] = dof_to_block_map[i];
1679 Block_number_to_dof_number_lookup[dof_to_block_map[i]].push_back(i);
1680 Ndof_in_block[dof_to_block_map[i]]++;
1681 }
1682
1683#ifdef PARANOID
1684 // paranoid check that every block number has at least one DOF associated
1685 // with it
1686 for (unsigned i = 0; i < max_block_number + 1; i++)
1687 {
1688 if (Block_number_to_dof_number_lookup[i].size() == 0)
1689 {
1690 std::ostringstream error_message;
1691 error_message << "block number " << i
1692 << " does not have any DOFs associated with it";
1693 throw OomphLibWarning(error_message.str(),
1696 }
1697 }
1698#endif
1699
1700 // Update the number of blocks types.
1701 Internal_nblock_types = max_block_number + 1;
1702
1703 // Distributed or not, depends on if we have more than one processor.
1704 bool distributed = this->master_distribution_pt()->distributed();
1705
1706 // Create the new block distributions.
1707 Internal_block_distribution_pt.resize(Internal_nblock_types);
1708 for (unsigned i = 0; i < Internal_nblock_types; i++)
1709 {
1710 unsigned block_dim = 0;
1711 for (unsigned j = 0; j < Ndof_in_block[i]; j++)
1712 {
1713 block_dim +=
1714 internal_dof_block_dimension(Block_number_to_dof_number_lookup[i][j]);
1715 }
1716 Internal_block_distribution_pt[i] =
1717 new LinearAlgebraDistribution(comm_pt(), block_dim, distributed);
1718 }
1719
1720 // Work out the distribution of the dof-level blocks.
1721 // Since several dof types may be coarsened into a single dof type.
1722 // We get the dof-level block distributions from the parent preconditioner.
1723
1724 // How many dof types are there?
1725 if (is_subsidiary_block_preconditioner())
1726 {
1727 // Delete any pre-existing distributions.
1728 const unsigned dof_block_distribution_size =
1729 Dof_block_distribution_pt.size();
1730 for (unsigned dof_i = 0; dof_i < dof_block_distribution_size; dof_i++)
1731 {
1732 delete Dof_block_distribution_pt[dof_i];
1733 }
1734 const unsigned ndofs = this->ndof_types();
1735 Dof_block_distribution_pt.resize(ndofs, 0);
1736
1737 // For each dof type, work out how many parent preconditioner dof types
1738 // are in it.
1739 for (unsigned dof_i = 0; dof_i < ndofs; dof_i++)
1740 {
1741 // For each external dof, we get the dofs coarsened into it (from the
1742 // parent preconditioner level, not the most fine grain level).
1743 const unsigned ncoarsened_dofs_in_dof_i =
1744 Doftype_coarsen_map_coarse[dof_i].size();
1746 0);
1748 parent_dof_i++)
1749 {
1751 master_block_preconditioner_pt()->dof_block_distribution_pt(
1752 Doftype_in_master_preconditioner_coarse
1753 [Doftype_coarsen_map_coarse[dof_i][parent_dof_i]]);
1754 }
1755
1756 Dof_block_distribution_pt[dof_i] = new LinearAlgebraDistribution;
1757
1758
1760 tmp_dist_pt, *Dof_block_distribution_pt[dof_i]);
1761 }
1762 }
1763
1764 // Create Block_distribution_pt
1765 {
1766 // Delete any existing distributions in Block_distribution_pt.
1767 // (This should already be deleted in clear_block_preconditioner_base(...)
1768 // but we are just being extra safe!).
1769 unsigned n_existing_block_dist = Block_distribution_pt.size();
1770 for (unsigned dist_i = 0; dist_i < n_existing_block_dist; dist_i++)
1771 {
1772 delete Block_distribution_pt[dist_i];
1773 }
1774
1775 Block_distribution_pt.clear();
1776
1777 // Work out the distributions of the concatenated blocks.
1778 unsigned super_block_size = Block_to_dof_map_coarse.size();
1779 Block_distribution_pt.resize(super_block_size, 0);
1780 for (unsigned super_block_i = 0; super_block_i < super_block_size;
1781 super_block_i++)
1782 {
1783 unsigned sub_block_size = Block_to_dof_map_coarse[super_block_i].size();
1785
1786 for (unsigned sub_block_i = 0; sub_block_i < sub_block_size;
1787 sub_block_i++)
1788 {
1789 tmp_dist_pt[sub_block_i] = dof_block_distribution_pt(
1790 Block_to_dof_map_coarse[super_block_i][sub_block_i]);
1791 }
1792
1793 Block_distribution_pt[super_block_i] = new LinearAlgebraDistribution;
1794
1796 tmp_dist_pt, *Block_distribution_pt[super_block_i]);
1797 }
1798
1799 } // Creating Block_distribution_pt.
1800
1801
1802 // Create the distribution of the preconditioner matrix,
1803 // if this preconditioner is a subsidiary preconditioner then it stored
1804 // at Distribution_pt;
1805 // if this preconditioner is a master preconditioner then it is stored
1806 // at Internal_preconditioner_matrix_distribution_pt.
1807 LinearAlgebraDistribution dist;
1809 Internal_block_distribution_pt, dist);
1810
1811 // Build the distribution.
1812 if (is_subsidiary_block_preconditioner())
1813 {
1814 this->build_distribution(dist);
1815 }
1816 else
1817 {
1818 Internal_preconditioner_matrix_distribution_pt =
1819 new LinearAlgebraDistribution(dist);
1820 }
1821
1822 Preconditioner_matrix_distribution_pt = new LinearAlgebraDistribution;
1824 Block_distribution_pt, *Preconditioner_matrix_distribution_pt);
1825
1826 // Clear all distributions in Auxiliary_block_distribution_pt, except for
1827 // the one which corresponds to the preconditioner matrix distribution. This
1828 // is already deleted by clear_block_preconditioner_base(...)
1829
1830 // Create the key which corresponds to
1831 // preconditioner_matrix_distribution_pt.
1832 {
1833 const unsigned nblocks = Block_distribution_pt.size();
1834 Vector<unsigned> preconditioner_matrix_key(nblocks, 0);
1835 for (unsigned i = 0; i < nblocks; i++)
1836 {
1838 }
1839
1840 // Now iterate through Auxiliary_block_distribution_pt and delete
1841 // everything except for the value which corresponds to
1842 // preconditioner_matrix_key.
1843 std::map<Vector<unsigned>, LinearAlgebraDistribution*>::iterator iter =
1844 Auxiliary_block_distribution_pt.begin();
1845 while (iter != Auxiliary_block_distribution_pt.end())
1846 {
1847 if (iter->first != preconditioner_matrix_key)
1848 {
1849 delete iter->second;
1850 iter++;
1851 }
1852 else
1853 {
1854 ++iter;
1855 }
1856 }
1857
1858 // Clear it just to be safe!
1859 Auxiliary_block_distribution_pt.clear();
1860
1861 // Insert the preconditioner matrix distribution.
1862 insert_auxiliary_block_distribution(
1863 preconditioner_matrix_key, Preconditioner_matrix_distribution_pt);
1864 } // End of Auxiliary_block_distribution_pt encapsulation.
1865
1866 // Clearing up after comm to assemble sparse lookup schemes.
1867#ifdef OOMPH_HAS_MPI
1868 if (send_requests_sparse.size() > 0)
1869 {
1873 }
1874 if (recv_requests_sparse.size() > 0)
1875 {
1879 }
1880 for (unsigned p = 0; p < nproc; p++)
1881 {
1883 delete[] dof_number_sparse_send[p];
1884 }
1886 delete[] dof_number_sparse_send;
1887 delete[] nreq_sparse;
1888 delete[] nreq_sparse_for_proc;
1889#endif
1890
1891 // Next we assemble the lookup schemes for the rows
1892 // if the matrix is not distributed then we assemble Global_index
1893 // if the matrix is distributed then Rows_to_send_..., Rows_to_recv_... etc.
1894 if (!distributed)
1895 {
1896 // Resize the storage.
1897 Global_index.resize(Internal_nblock_types);
1898 for (unsigned b = 0; b < Internal_nblock_types; b++)
1899 {
1900 Global_index[b].resize(Internal_block_distribution_pt[b]->nrow());
1901 }
1902
1903 // Compute:
1904 unsigned nrow = this->master_nrow();
1905 for (unsigned i = 0; i < nrow; i++)
1906 {
1907 // the dof type number;
1908 int dof_number = this->internal_dof_number(i);
1909 if (dof_number >= 0)
1910 {
1911 // the block number;
1912 unsigned block_number = Dof_number_to_block_number_lookup[dof_number];
1913
1914 // the index in the block.
1915 unsigned index_in_block = 0;
1916 unsigned ptr = 0;
1917 while (int(Block_number_to_dof_number_lookup[block_number][ptr]) !=
1918 dof_number)
1919 {
1920 index_in_block += internal_dof_block_dimension(
1921 Block_number_to_dof_number_lookup[block_number][ptr]);
1922 ptr++;
1923 }
1924 index_in_block += internal_index_in_dof(i);
1925 Global_index[block_number][index_in_block] = i;
1926 }
1927 }
1928 }
1929 // otherwise the matrix is distributed
1930 else
1931 {
1932#ifdef OOMPH_HAS_MPI
1933
1934 // the pointer to the master distribution
1935 const LinearAlgebraDistribution* master_distribution_pt =
1936 this->master_distribution_pt();
1937
1938 // resize the nrows... storage
1939 Nrows_to_send_for_get_block.resize(Internal_nblock_types, nproc);
1940 Nrows_to_send_for_get_block.initialise(0);
1941 Nrows_to_send_for_get_ordered.resize(nproc);
1942 Nrows_to_send_for_get_ordered.initialise(0);
1943
1944 // loop over my rows
1945 unsigned nrow_local = master_distribution_pt->nrow_local();
1946 unsigned first_row = master_distribution_pt->first_row();
1947 for (unsigned i = 0; i < nrow_local; i++)
1948 {
1949 // the block number
1950 int b = this->internal_block_number(first_row + i);
1951
1952 // check that the DOF i is associated with this preconditioner
1953 if (b >= 0)
1954 {
1955 // the block index
1956 unsigned j = this->internal_index_in_block(first_row + i);
1957
1958 // the processor this row will be sent to
1959 unsigned block_p = 0;
1960 while (!(Internal_block_distribution_pt[b]->first_row(block_p) <= j &&
1961 (Internal_block_distribution_pt[b]->first_row(block_p) +
1962 Internal_block_distribution_pt[b]->nrow_local(block_p) >
1963 j)))
1964 {
1965 block_p++;
1966 }
1967
1968 // and increment the counter
1969 Nrows_to_send_for_get_block(b, block_p)++;
1970 Nrows_to_send_for_get_ordered[block_p]++;
1971 }
1972 }
1973
1974 // resize the storage for Nrows_to_recv
1975 Nrows_to_recv_for_get_block.resize(Internal_nblock_types, nproc);
1976 Nrows_to_recv_for_get_block.initialise(0);
1977 Nrows_to_recv_for_get_ordered.resize(nproc);
1978 Nrows_to_recv_for_get_ordered.initialise(0);
1979
1980 // next we send the number of rows that will be sent by this processor
1985 Vector<unsigned> proc;
1986 for (unsigned p = 0; p < nproc; p++)
1987 {
1988 if (p != my_rank)
1989 {
1990 // send
1991 proc.push_back(p);
1992 nrows_to_send[p] = new unsigned[Internal_nblock_types];
1993 for (unsigned b = 0; b < Internal_nblock_types; b++)
1994 {
1995 nrows_to_send[p][b] = Nrows_to_send_for_get_block(b, p);
1996 }
1999 Internal_nblock_types,
2001 p,
2002 3,
2003 comm_pt()->mpi_comm(),
2004 &s_req);
2005 send_requests_nrow.push_back(s_req);
2006
2007 // recv
2008 nrows_to_recv[p] = new unsigned[Internal_nblock_types];
2011 Internal_nblock_types,
2013 p,
2014 3,
2015 comm_pt()->mpi_comm(),
2016 &r_req);
2017 recv_requests_nrow.push_back(r_req);
2018 }
2019 // send to self
2020 else
2021 {
2022 for (unsigned b = 0; b < Internal_nblock_types; b++)
2023 {
2024 Nrows_to_recv_for_get_block(b, p) =
2025 Nrows_to_send_for_get_block(b, p);
2026 }
2027 Nrows_to_recv_for_get_ordered[p] = Nrows_to_send_for_get_ordered[p];
2028 }
2029 }
2030
2031 // create some temporary storage for the global row indices that will
2032 // be received from another processor.
2033 DenseMatrix<int*> block_rows_to_send(Internal_nblock_types, nproc, 0);
2034 Vector<int*> ordered_rows_to_send(nproc, 0);
2035
2036 // resize the rows... storage
2037 Rows_to_send_for_get_block.resize(Internal_nblock_types, nproc);
2038 Rows_to_send_for_get_block.initialise(0);
2039 Rows_to_send_for_get_ordered.resize(nproc);
2040 Rows_to_send_for_get_ordered.initialise(0);
2041 Rows_to_recv_for_get_block.resize(Internal_nblock_types, nproc);
2042 Rows_to_recv_for_get_block.initialise(0);
2043
2044 // resize the storage
2045 for (unsigned p = 0; p < nproc; p++)
2046 {
2047 for (unsigned b = 0; b < Internal_nblock_types; b++)
2048 {
2049 Rows_to_send_for_get_block(b, p) =
2050 new int[Nrows_to_send_for_get_block(b, p)];
2051 if (p != my_rank)
2052 {
2053 block_rows_to_send(b, p) =
2054 new int[Nrows_to_send_for_get_block(b, p)];
2055 }
2056 else
2057 {
2058 Rows_to_recv_for_get_block(b, p) =
2059 new int[Nrows_to_send_for_get_block(b, p)];
2060 }
2061 }
2062 Rows_to_send_for_get_ordered[p] =
2063 new int[Nrows_to_send_for_get_ordered[p]];
2064 }
2065
2066
2067 // loop over my rows to allocate the nrows
2068 DenseMatrix<unsigned> ptr_block(Internal_nblock_types, nproc, 0);
2069 for (unsigned i = 0; i < nrow_local; i++)
2070 {
2071 // the block number
2072 int b = this->internal_block_number(first_row + i);
2073
2074 // check that the DOF i is associated with this preconditioner
2075 if (b >= 0)
2076 {
2077 // the block index
2078 unsigned j = this->internal_index_in_block(first_row + i);
2079
2080 // the processor this row will be sent to
2081 unsigned block_p = 0;
2082 while (!(Internal_block_distribution_pt[b]->first_row(block_p) <= j &&
2083 (Internal_block_distribution_pt[b]->first_row(block_p) +
2084 Internal_block_distribution_pt[b]->nrow_local(block_p) >
2085 j)))
2086 {
2087 block_p++;
2088 }
2089
2090 // and store the row
2091 Rows_to_send_for_get_block(b, block_p)[ptr_block(b, block_p)] = i;
2092 if (block_p != my_rank)
2093 {
2095 j - Internal_block_distribution_pt[b]->first_row(block_p);
2096 }
2097 else
2098 {
2099 Rows_to_recv_for_get_block(b, block_p)[ptr_block(b, block_p)] =
2100 j - Internal_block_distribution_pt[b]->first_row(block_p);
2101 }
2102 ptr_block(b, block_p)++;
2103 }
2104 }
2105
2106 // next block ordered
2107 for (unsigned p = 0; p < nproc; ++p)
2108 {
2109 int pt = 0;
2110 for (unsigned b = 0; b < Internal_nblock_types; ++b)
2111 {
2112 for (unsigned i = 0; i < Nrows_to_send_for_get_block(b, p); ++i)
2113 {
2114 Rows_to_send_for_get_ordered[p][pt] =
2115 Rows_to_send_for_get_block(b, p)[i];
2116 pt++;
2117 }
2118 }
2119 }
2120
2121 // next process the nrow recvs as they complete
2122
2123 // recv and store the data
2124 unsigned c = recv_requests_nrow.size();
2125 while (c > 0)
2126 {
2127 // wait for any communication to finish
2128 int req_number;
2131 c--;
2132
2133 // determine the source processor
2134 unsigned p = proc[req_number];
2135 proc.erase(proc.begin() + req_number);
2136
2137 // copy the data to its final storage
2138 Nrows_to_recv_for_get_ordered[p] = 0;
2139 for (unsigned b = 0; b < Internal_nblock_types; b++)
2140 {
2141 Nrows_to_recv_for_get_block(b, p) = nrows_to_recv[p][b];
2142 Nrows_to_recv_for_get_ordered[p] += nrows_to_recv[p][b];
2143 }
2144
2145 // and clear
2146 delete[] nrows_to_recv[p];
2147 }
2148
2149 // resize the storage for the incoming rows data
2150 Rows_to_recv_for_get_ordered.resize(nproc, 0);
2151 for (unsigned p = 0; p < nproc; p++)
2152 {
2153 if (p != my_rank)
2154 {
2155 for (unsigned b = 0; b < Internal_nblock_types; b++)
2156 {
2157 Rows_to_recv_for_get_block(b, p) =
2158 new int[Nrows_to_recv_for_get_block(b, p)];
2159 }
2160 }
2161 }
2162
2163 // compute the number of sends and recv from this processor
2164 // to each other processor
2165 Vector<unsigned> nsend_for_rows(nproc, 0);
2166 Vector<unsigned> nrecv_for_rows(nproc, 0);
2167 for (unsigned p = 0; p < nproc; p++)
2168 {
2169 if (p != my_rank)
2170 {
2171 for (unsigned b = 0; b < Internal_nblock_types; b++)
2172 {
2173 if (Nrows_to_send_for_get_block(b, p) > 0)
2174 {
2175 nsend_for_rows[p]++;
2176 }
2177 if (Nrows_to_recv_for_get_block(b, p) > 0)
2178 {
2179 nrecv_for_rows[p]++;
2180 }
2181 }
2182 }
2183 }
2184
2185 // finally post the sends and recvs
2187 MPI_Get_address(matrix_pt(), &base_displacement);
2189 for (unsigned p = 0; p < nproc; p++)
2190 {
2191 if (p != my_rank)
2192 {
2193 // send
2194 if (nsend_for_rows[p] > 0)
2195 {
2198 int send_sz[nsend_for_rows[p]];
2199 unsigned send_ptr = 0;
2200 for (unsigned b = 0; b < Internal_nblock_types; b++)
2201 {
2202 if (Nrows_to_send_for_get_block(b, p) > 0)
2203 {
2204 MPI_Type_contiguous(Nrows_to_send_for_get_block(b, p),
2205 MPI_INT,
2211 send_sz[send_ptr] = 1;
2212 send_ptr++;
2213 }
2214 }
2217 send_sz,
2219 send_types,
2222 for (unsigned i = 0; i < nsend_for_rows[p]; i++)
2223 {
2225 }
2227 MPI_Isend(matrix_pt(),
2228 1,
2230 p,
2231 4,
2232 comm_pt()->mpi_comm(),
2233 &send_req);
2234 req_rows.push_back(send_req);
2236 }
2237
2238 // recv
2239 if (nrecv_for_rows[p] > 0)
2240 {
2243 int recv_sz[nrecv_for_rows[p]];
2244 unsigned recv_ptr = 0;
2245 for (unsigned b = 0; b < Internal_nblock_types; b++)
2246 {
2247 if (Nrows_to_recv_for_get_block(b, p) > 0)
2248 {
2249 MPI_Type_contiguous(Nrows_to_recv_for_get_block(b, p),
2250 MPI_INT,
2253 MPI_Get_address(Rows_to_recv_for_get_block(b, p),
2256 recv_sz[recv_ptr] = 1;
2257 recv_ptr++;
2258 }
2259 }
2262 recv_sz,
2264 recv_types,
2267 for (unsigned i = 0; i < nrecv_for_rows[p]; i++)
2268 {
2270 }
2272 MPI_Irecv(matrix_pt(),
2273 1,
2275 p,
2276 4,
2277 comm_pt()->mpi_comm(),
2278 &recv_req);
2279 req_rows.push_back(recv_req);
2281 }
2282 }
2283 }
2284
2285 // cleaning up Waitalls
2286
2287
2288 // wait for the recv requests so we can compute
2289 // Nrows_to_recv_for_get_ordered
2290 unsigned n_req_rows = req_rows.size();
2291 if (n_req_rows)
2292 {
2294 }
2295
2296 // resize the storage
2297 Rows_to_recv_for_get_ordered.resize(nproc);
2298 Rows_to_recv_for_get_ordered.initialise(0);
2299
2300 // construct block offset
2301 Vector<int> vec_offset(Internal_nblock_types, 0);
2302 for (unsigned b = 1; b < Internal_nblock_types; ++b)
2303 {
2304 vec_offset[b] = vec_offset[b - 1] +
2305 Internal_block_distribution_pt[b - 1]->nrow_local();
2306 }
2307
2308 //
2309 for (unsigned p = 0; p < nproc; p++)
2310 {
2311 int pt = 0;
2312 Rows_to_recv_for_get_ordered[p] =
2313 new int[Nrows_to_recv_for_get_ordered[p]];
2314 for (unsigned b = 0; b < Internal_nblock_types; b++)
2315 {
2316 for (unsigned i = 0; i < Nrows_to_recv_for_get_block(b, p); i++)
2317 {
2318 Rows_to_recv_for_get_ordered[p][pt] =
2319 Rows_to_recv_for_get_block(b, p)[i] + vec_offset[b];
2320 pt++;
2321 }
2322 }
2323 }
2324
2325 // clean up
2326 for (unsigned p = 0; p < nproc; p++)
2327 {
2328 if (p != my_rank)
2329 {
2330 for (unsigned b = 0; b < Internal_nblock_types; b++)
2331 {
2332 delete[] block_rows_to_send(b, p);
2333 }
2334 if (Nrows_to_send_for_get_ordered[p] > 0)
2335 {
2336 delete[] ordered_rows_to_send[p];
2337 }
2338 }
2339 }
2340
2341 // and the send reqs
2342 unsigned n_req_send_nrow = send_requests_nrow.size();
2343 if (n_req_send_nrow)
2344 {
2346 }
2347 for (unsigned p = 0; p < nproc; p++)
2348 {
2349 delete[] nrows_to_send[p];
2350 }
2351#endif
2352 }
2353
2354 // If we asked for output of blocks to a file then do it.
2355 if (block_output_on()) output_blocks_to_files(Output_base_filename);
2356 }
2357
2358 //============================================================================
2359 //??ds
2360 /// Function to turn this preconditioner into a
2361 /// subsidiary preconditioner that operates within a bigger
2362 /// "master block preconditioner (e.g. a Navier-Stokes 2x2 block
2363 /// preconditioner dealing with the fluid sub-blocks within a
2364 /// 3x3 FSI preconditioner. Once this is done the master block
2365 /// preconditioner deals with the block setup etc.
2366 /// The vector block_map must specify the dof number in the
2367 /// master preconditioner that corresponds to a block number in this
2368 /// preconditioner. ??ds horribly misleading comment!
2369 /// The length of the vector is used to determine the number of
2370 /// blocks in this preconditioner therefore it must be correctly sized.
2371 /// This calls the other turn_into_subsidiary_block_preconditioner(...)
2372 /// function providing an empty doftype_to_doftype_map vector.
2373 //============================================================================
2374 template<typename MATRIX>
// NOTE(review): this listing has dropped several source lines (the embedded
// numbering jumps 2374 -> 2378, and lines 2380-2382, 2385, 2390-2391,
// 2396-2398 are absent), so the function signature and parts of the body are
// not visible here. Per the header comment above, this is the two-argument
// overload: it builds the identity doftype_coarsen_map and then delegates to
// the three-argument turn_into_subsidiary_block_preconditioner(...).
// Consult the full source file before editing.
 2378 {
 2379 // Create the identity dof_coarsen_map
 // (declaration on line 2383 was dropped by the extraction)
 2383
 2384 for (unsigned dof_i = 0;
 // loop bound on line 2385 missing — presumably the number of dof types;
 // TODO confirm against the full source
 2386 dof_i++)
 2387 {
 2388 // Create a vector of size 1 and value i,
 2389 // then push it into the dof_coarsen_map vector.
 2392 }
 2393
 2394 // Call the other turn_into_subsidiary_block_preconditioner function.
 2395 turn_into_subsidiary_block_preconditioner(
 2399 }
2400
2401
2402 //============================================================================
2403 /// Function to turn this block preconditioner into a
2404 /// subsidiary block preconditioner that operates within a bigger
2405 /// master block preconditioner (e.g. a Navier-Stokes 2x2 block
2406 /// preconditioner dealing with the fluid sub-blocks within a
2407 /// 3x3 FSI preconditioner. Once this is done the master block
2408 /// preconditioner deals with the block setup etc.
2409 ///
2410 /// The vector doftype_map must specify the dof type in the
2411 /// master preconditioner that corresponds to a dof type in this block
2412 /// preconditioner.
2413 ///
2414 /// In general, we want:
2415 /// doftype_map[doftype in subsidiary prec] = doftype in master prec.
2416 ///
2417 /// It tells this block preconditioner which dof types of the master
2418 /// block preconditioner it is working with.
2419 ///
2420 /// The length of the vector is used to determine the number of
2421 /// dof types in THIS block preconditioner therefore it must be correctly
2422 /// sized.
2423 ///
2424 /// For example, let the master block preconditioner have 5 dof types in total
2425 /// and a 1-4 dof type splitting where the block (0,0) corresponds to
2426 /// dof type 0 and the block (1,1) corresponds to dof types 1, 2, 3 and 4
2427 /// (i.e. it would have given to block_setup the vector [0,1,1,1,1]).
2428 /// Furthermore, it solves (1,1) block with subsidiary block preconditioner.
2429 /// Then the doftype_map passed to this function of the subsidiary block
2430 /// preconditioner would be [1, 2, 3, 4].
2431 ///
2432 /// Dof type coarsening (following on from the example above):
2433 /// Let the subsidiary block preconditioner (THIS block preconditioner)
2434 /// only work with two DOF types; then the master block preconditioner must
2435 /// "coarsen" the dof types by providing the optional argument
2436 /// doftype_coarsen_map vector.
2437 ///
2438 /// The doftype_coarsen_map vector (in this case) might be [[0,1], [2,3]]
2439 /// telling the subsidiary block preconditioner that the SUBSIDIARY dof types
2440 /// 0 and 1 should be treated as dof type 0 and the subsidiary dof types 2
2441 /// and 3 should be treated as subsidiary dof type 1.
2442 ///
2443 /// If no doftype_coarsen_map vector is provided, then the identity is
2444 /// used automatically (see the turn_into_subsidiary_block_preconditioner(...)
2445 /// function with only two arguments). In the above case, the identity
2446 /// doftype_coarsen_map vector for the subsidiary block preconditioner
2447 /// would be the 2D vector [[0], [1], [2], [3]] which means
2448 /// dof type 0 is treated as dof type 0,
2449 /// dof type 1 is treated as dof type 1,
2450 /// dof type 2 is treated as dof type 2, and
2451 /// dof type 3 is treated as dof type 3.
2452 //============================================================================
2453 template<typename MATRIX>
// NOTE(review): signature lines 2454-2457 were dropped from this listing.
// Per the header comment above, this is the three-argument overload taking
// the master preconditioner pointer, the doftype map into the master, and
// the doftype coarsening map. The body simply stores the three arguments in
// the corresponding member variables.
 2458 {
 2459 // Set the master block preconditioner pointer
 2460 Master_block_preconditioner_pt = master_block_prec_pt;
 2461
 2462 // Set the Doftype_coarsen_map_coarse.
 2463 Doftype_coarsen_map_coarse = doftype_coarsen_map_coarse;
 2464
 // Store the master's dof types this subsidiary preconditioner works with
 // (right-hand side on line 2466 was dropped by the extraction).
 2465 Doftype_in_master_preconditioner_coarse =
 2467 } // end of turn_into_subsidiary_block_preconditioner(...)
2468
2469
2470 //============================================================================
2471 /// Determine the size of the matrix blocks and setup the
2472 /// lookup schemes relating the global degrees of freedom with
2473 /// their "blocks" and their indices (row/column numbers) in those
2474 /// blocks.
2475 /// The distributions of the preconditioner and the blocks are
2476 /// automatically specified (and assumed to be uniform) at this
2477 /// stage.
2478 /// This method should be used if each DOF type corresponds to a
2479 /// unique block type.
2480 //============================================================================
2481 template<typename MATRIX>
// NOTE(review): the signature line 2482 was dropped from this listing; per
// the header comment above this is the no-argument block_setup() used when
// each DOF type corresponds to a unique block type. It builds the identity
// dof-to-block lookup and calls block_setup(dof_to_block_lookup).
 2483 {
 2484#ifdef PARANOID
 2485
 2486 // Subsidiary preconditioners don't really need the meshes
 2487 if (this->is_master_block_preconditioner())
 2488 {
 2489 std::ostringstream err_msg;
 2490 unsigned n = nmesh();
 2491 if (n == 0)
 2492 {
 2493 err_msg << "No meshes have been set for this block preconditioner!\n"
 2494 << "Set one with set_nmesh(...), set_mesh(...)" << std::endl;
 // throw argument line 2496 was dropped by the extraction
 2495 throw OomphLibError(
 2497 for (unsigned m = 0; m < n; m++)
 2498 {
 2499 if (Mesh_pt[m] == 0)
 2500 {
 2501 err_msg << "The mesh pointer to mesh " << m << " is null!\n"
 2502 << "Set a non-null one with set_mesh(...)" << std::endl;
 2503 throw OomphLibError(
 2505 }
 2506 }
 2507 }
 2508 }
 2509#endif
 2510
 2511 // Get the number of dof types.
 2512 unsigned internal_n_dof_types = ndof_types();
 2513
 2514 // Build the dof to block map - assume that each type of dof corresponds
 2515 // to a different type of block.
 2516 Vector<unsigned> dof_to_block_lookup(internal_n_dof_types);
 2517 for (unsigned i = 0; i < internal_n_dof_types; i++)
 2518 {
 // loop body on line 2519 missing — presumably dof_to_block_lookup[i] = i;
 // TODO confirm against the full source
 2520 }
 2521
 2522 // call the block setup method
 2523 this->block_setup(dof_to_block_lookup);
 2524 }
2525
2526
2527 //============================================================================
2528 /// Get the block matrices required for the block preconditioner. Takes a
2529 /// pointer to a matrix of bools that indicate if a specified sub-block is
2530 /// required for the preconditioning operation. Computes the required block
2531 /// matrices, and stores pointers to them in the matrix block_matrix_pt. If an
2532 /// entry in block_matrix_pt is equal to NULL that sub-block has not been
2533 /// requested and is therefore not available.
2534 //============================================================================
2535 template<typename MATRIX>
// NOTE(review): signature lines 2536-2538 were dropped from this listing.
// Per the header comment above, this is get_blocks(...): for every (i,j)
// flagged in required_blocks it allocates a new MATRIX, fills it via
// get_block(i,j,...), and stores the pointer in block_matrix_pt(i,j);
// pointers for blocks that were not requested are set to null. Ownership of
// the newly allocated matrices passes to the caller.
 2539 {
 2540 // Cache number of block types
 2541 const unsigned n_block_types = nblock_types();
 2542
 2543#ifdef PARANOID
 2544 // If required blocks matrix pointer is not the correct size then abort.
 2545 if ((required_blocks.nrow() != n_block_types) ||
 2546 (required_blocks.ncol() != n_block_types))
 2547 {
 2548 std::ostringstream error_message;
 2549 error_message << "The size of the matrix of bools required_blocks "
 2550 << "(which indicates which blocks are required) is not the "
 2551 << "right size, required_blocks is "
 2552 << required_blocks.ncol() << " x " << required_blocks.nrow()
 2553 << ", whereas it should "
 2554 << "be " << n_block_types << " x " << n_block_types;
 2555 throw OomphLibError(
 2556 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
 2557 }
 2558
 2559 // If block matrix pointer is not the correct size then abort.
 2560 if ((block_matrix_pt.nrow() != n_block_types) ||
 2561 (block_matrix_pt.ncol() != n_block_types))
 2562 {
 2563 std::ostringstream error_message;
 2564 error_message << "The size of the block matrix pt is not the "
 2565 << "right size, block_matrix_pt is "
 2566 << block_matrix_pt.ncol() << " x " << block_matrix_pt.nrow()
 2567 << ", whereas it should "
 2568 << "be " << n_block_types << " x " << n_block_types;
 2569 throw OomphLibError(
 2570 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
 2571 }
 2572
 2573#endif
 2574
 2575 // Loop over the blocks
 2576 for (unsigned i = 0; i < n_block_types; i++)
 2577 {
 2578 for (unsigned j = 0; j < n_block_types; j++)
 2579 {
 2580 // If block(i,j) is required then create a matrix and fill it in.
 2581 if (required_blocks(i, j))
 2582 {
 2583 //??ds might want to remove this use of new as well?
 2584 block_matrix_pt(i, j) = new MATRIX;
 2585 get_block(i, j, *block_matrix_pt(i, j));
 2586 }
 2587
 2588 // Otherwise set pointer to null.
 2589 else
 2590 {
 2591 block_matrix_pt(i, j) = 0;
 2592 }
 2593 }
 2594 }
 2595 }
2596
2597 //============================================================================
2598 /// Takes the naturally ordered vector and extracts the blocks
2599 /// indicated by the block number (the values) in the Vector
2600 /// block_vec_number all at once, then concatenates them without
2601 /// communication. Here, the values in block_vec_number is the block number
2602 /// in the current preconditioner.
2603 /// This is a non-const function because distributions may be created
2604 /// and stored in Auxiliary_block_distribution_pt for future use.
2605 //============================================================================
2606 template<typename MATRIX>
// NOTE(review): this listing dropped a number of source lines (the embedded
// numbering jumps, e.g. 2607-2608, 2620, 2640-2641, 2660-2661, 2705, 2711,
// 2739, 2748, 2759). Per the header comment above, this function extracts
// the blocks listed in block_vec_number from the naturally ordered vector v
// and concatenates them (without communication) into w, caching the
// concatenated distribution in Auxiliary_block_distribution_pt. Non-const
// because the cached distribution may be created here.
 2609 const DoubleVector& v,
 2610 DoubleVector& w)
 2611 {
 2612#ifdef PARANOID
 2613
 2614 // Check if v is built.
 2615 if (!v.built())
 2616 {
 2617 std::ostringstream err_msg;
 2618 err_msg << "The distribution of the global vector v must be setup.";
 2619 throw OomphLibError(
 2621 }
 2622
 2623 // v must have the same distribution as the upper-most master block
 2624 // preconditioner, since the upper-most master block preconditioner
 2625 // should have the same distribution as the matrix pointed to
 2626 // by matrix_pt().
 2627 if (*(v.distribution_pt()) != *(this->master_distribution_pt()))
 2628 {
 2629 std::ostringstream err_msg;
 2630 err_msg << "The distribution of the global vector v must match the "
 2631 << " specified master_distribution_pt(). \n"
 2632 << "i.e. Distribution_pt in the master preconditioner";
 2633 throw OomphLibError(
 2635 }
 2636
 2637 // Check to see if there are more blocks defined in the block_vec_number
 2638 // vector than the number of block types. This is not allowed.
 2639 const unsigned para_nblock_types = nblock_types();
 // the size comparison on lines 2640-2641 was dropped by the extraction
 2642 {
 2643 std::ostringstream err_msg;
 2644 err_msg << "You have requested " << para_block_vec_number_size
 2645 << " number of blocks, (block_vec_number.size() is "
 2646 << para_block_vec_number_size << ").\n"
 2647 << "But there are only " << para_nblock_types
 2648 << " nblock_types.\n"
 2649 << "Please make sure that block_vec_number is correctly sized.\n";
 2650 throw OomphLibError(
 2652 }
 2654 // Check if any block numbers defined in block_vec_number is equal to or
 2655 // greater than the number of block types.
 2656 // E.g. if there are 5 block types, we can only have block numbers:
 2657 // 0, 1, 2, 3 and 4.
 2658 for (unsigned i = 0; i < para_block_vec_number_size; i++)
 2659 {
 2662 {
 2663 std::ostringstream err_msg;
 2664 err_msg << "block_vec_number[" << i << "] is " << para_required_block
 2665 << ".\n"
 2666 << "But there are only " << para_nblock_types
 2667 << " nblock_types.\n";
 2668 throw OomphLibError(
 2670 }
 2672
 2673 // Check that no block number is inserted twice.
 2674 std::set<unsigned> para_set;
 2675 for (unsigned b = 0; b < para_block_vec_number_size; b++)
 2676 {
 2677 std::pair<std::set<unsigned>::iterator, bool> para_set_ret;
 // the set insertion on line 2678 was dropped by the extraction
 2679
 2680 if (!para_set_ret.second)
 2681 {
 2682 std::ostringstream err_msg;
 2683 err_msg << "Error: the block number " << block_vec_number[b]
 2684 << " appears twice.\n";
 2685 throw OomphLibError(
 2687 }
 2688 }
 2689#endif
 2690
 2691 // Number of blocks to get.
 2692 const unsigned n_block = block_vec_number.size();
 2693
 2694 // Each block is made of dof types. We get the most fine grain dof types.
 2695 // Most fine grain in the sense that these are the dof types that belongs
 2696 // in this block before any coarsening of dof types has taken place.
 2697 // The ordering of the dof types matters, this is handled properly when
 2698 // creating the Block_to_dof_map_fine vector and must be respected here.
 2699 // I.e. we cannot arbitrarily insert dof types (even if they are correct)
 2700 // in the vector most_fine_grain_dof.
 2701 Vector<unsigned> most_fine_grain_dof;
 2702 for (unsigned b = 0; b < n_block; b++)
 2703 {
 2704 const unsigned mapped_b = block_vec_number[b];
 2706 Block_to_dof_map_fine[mapped_b].begin(),
 2707 Block_to_dof_map_fine[mapped_b].end());
 2708 }
 2709
 2710 // Get all the dof level vectors in one go.
 2712 internal_get_block_vectors(most_fine_grain_dof, v, dof_block_vector);
 2713
 2714 // Next we need to build the output DoubleVector w with the correct
 2715 // distribution: the concatenation of the distributions of all the
 2716 // dof-level vectors. This is the same as the concatenation of the
 2717 // distributions of the blocks within this preconditioner.
 2718 //
 2719 // So we first check if it exists already, if not, we create it and
 2720 // store it for future use. We store it because concatenation of
 2721 // distributions requires communication, so concatenation of
 2722 // distributions on-the-fly should be avoided.
 2723 std::map<Vector<unsigned>, LinearAlgebraDistribution*>::const_iterator iter;
 2724
 2725 // Attempt to get an iterator pointing to the pair with the value
 2726 // block_vec_number.
 2727 iter = Auxiliary_block_distribution_pt.find(block_vec_number);
 2728
 2729 if (iter != Auxiliary_block_distribution_pt.end())
 2730 // If it exists, build w with the distribution pointed to
 2731 // by pair::second.
 2732 {
 2733 w.build(iter->second);
 2734 }
 2735 else
 2736 // Else, we need to create the distribution and store it in
 2737 // Auxiliary_block_distribution_pt.
 2738 {
 2740 for (unsigned b = 0; b < n_block; b++)
 2741 {
 2742 tmp_vec_dist_pt[b] = Block_distribution_pt[block_vec_number[b]];
 2743 }
 2744
 2745 // Note that the distribution is created with new but not deleted here.
 2746 // This is handled in the clean up functions.
 2747 LinearAlgebraDistribution* tmp_dist_pt = new LinearAlgebraDistribution;
 2749 *tmp_dist_pt);
 2750
 2751 // Store the pair of Vector<unsigned> and LinearAlgebraDistribution*
 2752 insert_auxiliary_block_distribution(block_vec_number, tmp_dist_pt);
 2753
 2754 // Build w.
 2755 w.build(tmp_dist_pt);
 2756 }
 2757
 2758 // Now concatenate all the dof level vectors into the vector w.
 // (the concatenation call on lines 2759-2760 was dropped by the extraction)
 2760
 2761 } // get_concatenated_block_vector(...)
2762
2763 //============================================================================
2764 /// Takes the concatenated block ordered vector, w, and copies its
2765 /// entries to the appropriate entries in the naturally ordered vector, v.
2766 /// Here the values in block_vec_number indicate which blocks the vector
2767 /// w is a concatenation of. The block numbers are those in the current
2768 /// preconditioner. If the preconditioner is a subsidiary block
2769 /// preconditioner the other entries in v that are not associated with it
2770 /// are left alone.
2771 //============================================================================
2772 template<typename MATRIX>
// NOTE(review): this listing dropped several source lines (numbering jumps
// at 2773-2774, 2786, 2805, 2807, 2824, 2840, 2865-2866, 2875-2876, 2903,
// 2913, 2921). Per the header comment above, this is the inverse of
// get_concatenated_block_vector(...): it splits the concatenated block
// vector w back into the most fine grain dof-level vectors and returns
// them into the naturally ordered vector v; entries of v not associated
// with this (possibly subsidiary) preconditioner are left untouched.
 2775 const DoubleVector& w,
 2776 DoubleVector& v) const
 2777 {
 2778#ifdef PARANOID
 2779
 2780 // Check if v is built.
 2781 if (!v.built())
 2782 {
 2783 std::ostringstream err_msg;
 2784 err_msg << "The distribution of the global vector v must be setup.";
 2785 throw OomphLibError(
 2787 }
 2788
 2789 // v must have the same distribution as the upper-most master block
 2790 // preconditioner, since the upper-most master block preconditioner
 2791 // should have the same distribution as the matrix pointed to
 2792 // by matrix_pt().
 2793 if (*(v.distribution_pt()) != *(this->master_distribution_pt()))
 2794 {
 2795 std::ostringstream err_msg;
 2796 err_msg << "The distribution of the global vector v must match the "
 2797 << " specified master_distribution_pt(). \n"
 2798 << "i.e. Distribution_pt in the master preconditioner";
 2799 throw OomphLibError(
 2801 }
 2802
 2803 // Check to see if there are more blocks defined in the block_vec_number
 2804 // vector than the number of block types. This is not allowed.
 2806 const unsigned para_n_block = nblock_types();
 // the size comparison on line 2807 was dropped by the extraction
 2808 {
 2809 std::ostringstream err_msg;
 2810 err_msg << "Trying to return " << para_block_vec_number_size
 2811 << " block vectors.\n"
 2812 << "But there are only " << para_n_block << " block types.\n";
 2813 throw OomphLibError(
 2815 }
 2816
 2817 // Check if any block numbers defined in block_vec_number is equal to or
 2818 // greater than the number of block types.
 2819 // E.g. if there are 5 block types, we can only have block numbers:
 2820 // 0, 1, 2, 3 and 4.
 2821 for (unsigned b = 0; b < para_block_vec_number_size; b++)
 2822 {
 2823 const unsigned para_required_block = block_vec_number[b];
 2825 {
 2826 std::ostringstream err_msg;
 2827 err_msg << "block_vec_number[" << b << "] is " << para_required_block
 2828 << ".\n"
 2829 << "But there are only " << para_n_block << " block types.\n";
 2830 throw OomphLibError(
 2832 }
 2833 }
 2834
 2835 // Check that no block number is inserted twice.
 2836 std::set<unsigned> para_set;
 2837 for (unsigned b = 0; b < para_block_vec_number_size; b++)
 2838 {
 2839 std::pair<std::set<unsigned>::iterator, bool> para_set_ret;
 2841
 2842 if (!para_set_ret.second)
 2843 {
 2844 std::ostringstream err_msg;
 2845 err_msg << "Error: the block number " << block_vec_number[b]
 2846 << " appears twice.\n";
 2847 throw OomphLibError(
 2849 }
 2850 }
 2851
 2852 // Check that w is built.
 2853 if (!w.built())
 2854 {
 2855 std::ostringstream err_msg;
 2856 err_msg << "The distribution of the block vector w must be setup.";
 2857 throw OomphLibError(
 2859 }
 2860
 2861 // Check that the distributions defined by block_vec_number is correct for
 2862 // the distribution from w.
 2863 // Recall that w is the concatenation of the block vectors defined by
 2864 // the values in block_vec_number. We check that this is the case.
 2867
 2868 for (unsigned b = 0; b < para_block_vec_number_size; b++)
 2869 {
 2870 para_vec_dist_pt[b] = Block_distribution_pt[block_vec_number[b]];
 2871 }
 2872
 2873 LinearAlgebraDistribution para_tmp_dist;
 2874
 // (the distribution concatenation call on 2875-2877 was partially dropped)
 2877
 2878 if (*w.distribution_pt() != para_tmp_dist)
 2879 {
 2880 std::ostringstream err_msg;
 2881 err_msg << "The distribution of the block vector w does not match \n"
 2882 << "the concatenation of the block distributions defined in \n"
 2883 << "block_vec_number.\n";
 2884 throw OomphLibError(
 2886 }
 2887#endif
 2888
 2889 // Number of blocks to return.
 2890 const unsigned n_block = block_vec_number.size();
 2891
 2892 // Each block is made of dof types. We get the most fine grain dof types.
 2893 // Most fine grain in the sense that these are the dof types that belongs
 2894 // in this block before any coarsening of dof types has taken place.
 2895 // The ordering of the dof types matters, this is handled properly when
 2896 // creating the Block_to_dof_map_fine vector and must be respected here.
 2897 // I.e. we cannot arbitrarily insert dof types (even if they are correct)
 2898 // in the vector most_fine_grain_dof.
 2899 Vector<unsigned> most_fine_grain_dof;
 2900 for (unsigned b = 0; b < n_block; b++)
 2901 {
 2902 const unsigned mapped_b = block_vec_number[b];
 2904 Block_to_dof_map_fine[mapped_b].begin(),
 2905 Block_to_dof_map_fine[mapped_b].end());
 2906 }
 2907
 2908 // The number of most fine grain dof types associated with the blocks
 2909 // defined by block_vec_number.
 2910 const unsigned ndof = most_fine_grain_dof.size();
 2911
 2912 // Build each dof level vector with the correct distribution.
 2914 for (unsigned d = 0; d < ndof; d++)
 2915 {
 2916 dof_vector[d].build(
 2917 internal_block_distribution_pt(most_fine_grain_dof[d]));
 2918 }
 2919
 2920 // Perform the splitting of w into the most fine grain dof level vectors.
 // (the split call on line 2921 was dropped by the extraction)
 2922
 2923 // Return all the dof level vectors in one go.
 2924 internal_return_block_vectors(most_fine_grain_dof, dof_vector, v);
 2925 } // return_concatenated_block_vector(...)
2926
2927 //============================================================================
2928 /// Takes the naturally ordered vector and rearranges it into a
2929 /// vector of sub vectors corresponding to the blocks, so s[b][i] contains
2930 /// the i-th entry in the vector associated with block b.
2931 /// Note: If the preconditioner is a subsidiary preconditioner then only the
2932 /// sub-vectors associated with the blocks of the subsidiary preconditioner
2933 /// will be included. Hence the length of v is master_nrow() whereas the
2934 /// total length of the s vectors is the sum of the lengths of the
2935 /// individual block vectors defined in block_vec_number.
2936 //============================================================================
2937 template<typename MATRIX>
// NOTE(review): this listing dropped several source lines (numbering jumps
// at 2938-2939, 2951, 2965, 2971-2972, 2992, 3008, 3032, 3037, 3043, 3072,
// 3078). Per the header comment above, this overload rearranges the
// naturally ordered vector v into one sub-vector per requested block:
// s[b][i] is the i-th entry of the vector associated with block
// block_vec_number[b]. Dof-level vectors are extracted in one go and then
// concatenated per block.
 2940 const DoubleVector& v,
 2941 Vector<DoubleVector>& s) const
 2942 {
 2943#ifdef PARANOID
 2944
 2945 // Check if v is built.
 2946 if (!v.built())
 2947 {
 2948 std::ostringstream err_msg;
 2949 err_msg << "The distribution of the global vector v must be setup.";
 2950 throw OomphLibError(
 2952 }
 2953
 2954 // v must have the same distribution as the upper-most master block
 2955 // preconditioner, since the upper-most master block preconditioner
 2956 // should have the same distribution as the matrix pointed to
 2957 // by matrix_pt().
 2958 if (*(v.distribution_pt()) != *(this->master_distribution_pt()))
 2959 {
 2960 std::ostringstream err_msg;
 2961 err_msg << "The distribution of the global vector v must match the "
 2962 << " specified master_distribution_pt(). \n"
 2963 << "i.e. Distribution_pt in the master preconditioner";
 2964 throw OomphLibError(
 2966 }
 2967
 2968 // Check to see if there are more blocks defined in the block_vec_number
 2969 // vector than the number of block types. This is not allowed.
 2970 const unsigned para_nblock_types = nblock_types();
 // the size comparison on lines 2971-2972 was dropped by the extraction
 2973 {
 2974 std::ostringstream err_msg;
 2975 err_msg << "You have requested " << para_block_vec_number_size
 2976 << " number of blocks, (block_vec_number.size() is "
 2977 << para_block_vec_number_size << ").\n"
 2978 << "But there are only " << para_nblock_types
 2979 << " nblock_types.\n"
 2980 << "Please make sure that block_vec_number is correctly sized.\n";
 2981 throw OomphLibError(
 2983 }
 2984
 2985 // Check if any block numbers defined in block_vec_number is equal to or
 2986 // greater than the number of block types.
 2987 // E.g. if there are 5 block types, we can only have block numbers:
 2988 // 0, 1, 2, 3 and 4.
 2989 for (unsigned i = 0; i < para_block_vec_number_size; i++)
 2990 {
 2991 const unsigned para_required_block = block_vec_number[i];
 2993 {
 2994 std::ostringstream err_msg;
 2995 err_msg << "block_vec_number[" << i << "] is " << para_required_block
 2996 << ".\n"
 2997 << "But there are only " << para_nblock_types
 2998 << " nblock_types.\n";
 2999 throw OomphLibError(
 3001 }
 3002 }
 3003 // Check that no block number is inserted twice.
 3004 std::set<unsigned> para_set;
 3005 for (unsigned b = 0; b < para_block_vec_number_size; b++)
 3006 {
 3007 std::pair<std::set<unsigned>::iterator, bool> para_set_ret;
 3009
 3010 if (!para_set_ret.second)
 3011 {
 3012 std::ostringstream err_msg;
 3013 err_msg << "Error: the block number " << block_vec_number[b]
 3014 << " appears twice.\n";
 3015 throw OomphLibError(
 3017 }
 3018 }
 3019#endif
 3020
 3021 // Number of blocks to get.
 3022 const unsigned n_block = block_vec_number.size();
 3023 s.resize(n_block);
 3024
 3025 // Each block is made of dof types. We get the most fine grain dof types.
 3026 // Most fine grain in the sense that these are the dof types that belongs
 3027 // in this block before any coarsening of dof types has taken place.
 3028 // The ordering of the dof types matters, this is handled properly when
 3029 // creating the Block_to_dof_map_fine vector and must be respected here.
 3030 // I.e. we cannot arbitrarily insert dof types (even if they are correct)
 3031 // in the vector most_fine_grain_dof.
 3033 for (unsigned b = 0; b < n_block; b++)
 3034 {
 3035 const unsigned mapped_b = block_vec_number[b];
 3036
 3038 Block_to_dof_map_fine[mapped_b].begin(),
 3039 Block_to_dof_map_fine[mapped_b].end());
 3040 }
 3041
 3042 // Get all the dof level vectors in one go.
 3044 internal_get_block_vectors(most_fine_grain_dof, v, dof_vector);
 3045
 3046 // For each block vector requested,
 3047 // build the block s[b],
 3048 // concatenate the corresponding dof vector
 3049
 3050 // Since all the dof vectors are in dof_vector,
 3051 // we need to loop through this.
 3052 // The offset helps us loop through this.
 3053 unsigned offset = 0;
 3054
 3055 for (unsigned b = 0; b < n_block; b++)
 3056 {
 3057 // The actual block number required.
 3058 const unsigned mapped_b = block_vec_number[b];
 3059
 3060 // How many most fine grain dofs are in this block?
 3061 const unsigned n_dof = Block_to_dof_map_fine[mapped_b].size();
 3062
 3063 if (n_dof == 1)
 3064 // No need to concatenate, just copy the DoubleVector.
 3065 {
 3066 s[b] = dof_vector[offset];
 3067 }
 3068 else
 3069 // Concatenate the relevant dof vectors into s[b].
 3070 {
 3071 s[b].build(Block_distribution_pt[mapped_b], 0);
 3073 for (unsigned vec_i = 0; vec_i < n_dof; vec_i++)
 3074 {
 3075 tmp_vec_pt[vec_i] = &dof_vector[offset + vec_i];
 3076 }
 3077
 // (the concatenate-without-communication call on line 3078 was dropped)
 3079 s[b]);
 3080 }
 3081
 3082 // Update the offset.
 3083 offset += n_dof;
 3084 }
 3085 } // get_block_vectors(...)
3086
3087
3088 //============================================================================
3089 /// Takes the naturally ordered vector and rearranges it into a
3090 /// vector of sub vectors corresponding to the blocks, so s[b][i] contains
3091 /// the i-th entry in the vector associated with block b.
3092 /// Note: If the preconditioner is a subsidiary preconditioner then only the
3093 /// sub-vectors associated with the blocks of the subsidiary preconditioner
3094 /// will be included. Hence the length of v is master_nrow() whereas the
3095 /// total length of the s vectors is Nrow.
3096 /// This is simply a wrapper around the other get_block_vectors(...) function
3097 /// where the block_vec_number Vector is the identity, i.e.
3098 /// block_vec_number is [0, 1, ..., nblock_types - 1].
3099 //============================================================================
3100 template<typename MATRIX>
// NOTE(review): the function-name line 3101 and the declaration on line
// 3108 were dropped from this listing. Per the header comment above, this
// is the identity wrapper: it builds block_vec_number = [0, 1, ...,
// nblock_types - 1] and delegates to the general get_block_vectors(...).
 3102 const DoubleVector& v, Vector<DoubleVector>& s) const
 3103 {
 3104 // Get the number of blocks in this block preconditioner.
 3105 const unsigned n_block = nblock_types();
 3106
 3107 // Create the identity vector.
 3109 for (unsigned i = 0; i < n_block; i++)
 3110 {
 3111 required_block[i] = i;
 3112 }
 3113
 3114 // Call the other function which does the work.
 3115 get_block_vectors(required_block, v, s);
 3116 }
3117
3118 //============================================================================
3119 /// Takes the naturally ordered vector and
3120 /// rearranges it into a vector of sub vectors corresponding to the blocks,
3121 /// so s[b][i] contains the i-th entry in the vector associated with block b.
3122 /// The block_vec_number indicates which blocks we want.
3123 /// These blocks and vectors are those corresponding to the internal blocks.
3124 /// Note: If the preconditioner is a subsidiary preconditioner then only the
3125 /// sub-vectors associated with the blocks of the subsidiary preconditioner
3126 /// will be included. Hence the length of v is master_nrow() whereas the
3127 /// total length of the s vectors is the sum of the Nrow of the sub vectors.
3128 //============================================================================
3129 template<typename MATRIX>
3132 const DoubleVector& v,
3133 Vector<DoubleVector>& s) const
3134 {
// NOTE(review): this listing is a doxygen text dump; the lines that carried
// hyperlinked identifiers (the function-name line of the signature, the
// nblock_send/nblock_recv counter declarations, the MPI datatype /
// displacement / request containers and, presumably, the MPI_Type_commit,
// MPI_Type_create_struct and MPI_Type_free calls) were dropped during
// extraction. Compare against the original block_preconditioner.cc
// before editing this function.
3135#ifdef PARANOID
3136 if (!v.built())
3137 {
3138 std::ostringstream error_message;
3139 error_message << "The distribution of the global vector v must be setup.";
3140 throw OomphLibError(
3141 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3142 }
3143 if (*(v.distribution_pt()) != *(this->master_distribution_pt()))
3144 {
3145 std::ostringstream error_message;
3146 error_message << "The distribution of the global vector v must match the "
3147 << " specified master_distribution_pt(). \n"
3148 << "i.e. Distribution_pt in the master preconditioner";
3149 throw OomphLibError(
3150 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3151 }
3152#endif
3153
3154 // Number of block types
3155 // const unsigned nblock = this->internal_nblock_types();
3156 const unsigned nblock = block_vec_number.size();
3157
3158 // if + only one processor
3159 // + more than one processor but matrix_pt is not distributed
3160 // then use the serial get_block method
3161 if (this->distribution_pt()->communicator_pt()->nproc() == 1 ||
3162 !this->distribution_pt()->distributed())
3163 {
3164 // Vector of vectors for each section of residual vector
3165 s.resize(nblock);
3166
3167 // pointer to the data in v
3168 const double* v_pt = v.values_pt();
3169
3170 // setup the block vector and then insert the data
// Serial path: copy each requested row of v directly via the
// Global_index lookup table -- no communication needed.
3171 for (unsigned b = 0; b < nblock; b++)
3172 {
3173 const unsigned required_block = block_vec_number[b];
3174 s[b].build(Internal_block_distribution_pt[required_block], 0.0);
3175 double* s_pt = s[b].values_pt();
3176 unsigned nrow = s[b].nrow();
3177 for (unsigned i = 0; i < nrow; i++)
3178 {
3179 s_pt[i] = v_pt[this->Global_index[required_block][i]];
3180 }
3181 }
3182 }
3183 // otherwise use mpi
3184 else
3185 {
3186#ifdef OOMPH_HAS_MPI
3187 // my rank
3188 unsigned my_rank = this->distribution_pt()->communicator_pt()->my_rank();
3189
3190 // the number of processors
3191 unsigned nproc = this->distribution_pt()->communicator_pt()->nproc();
3192
3193 // build the vectors
3194 s.resize(nblock);
3195 for (unsigned b = 0; b < nblock; b++)
3196 {
3197 const unsigned required_block = block_vec_number[b];
3198 s[b].build(Internal_block_distribution_pt[required_block], 0.0);
3199 }
3200
3201 // determine the maximum number of rows to be sent or recv
3202 // and determine the number of blocks each processor will send and recv
3203 // communication for
3206 unsigned max_n_send_or_recv = 0;
3207 for (unsigned p = 0; p < nproc; p++)
3208 {
3209 for (unsigned b = 0; b < nblock; b++)
3210 {
3211 const unsigned required_block = block_vec_number[b];
3212 max_n_send_or_recv = std::max(
3213 max_n_send_or_recv, Nrows_to_send_for_get_block(required_block, p));
3214 max_n_send_or_recv = std::max(
3215 max_n_send_or_recv, Nrows_to_recv_for_get_block(required_block, p));
3216 if (Nrows_to_send_for_get_block(required_block, p) > 0)
3217 {
3218 nblock_send[p]++;
3219 }
3220 if (Nrows_to_recv_for_get_block(required_block, p) > 0)
3221 {
3222 nblock_recv[p]++;
3223 }
3224 }
3225 }
3226
3227 // create a vectors of 1s the size of the nblock for the mpi indexed
3228 // data types
3229 int* block_lengths = new int[max_n_send_or_recv];
3230 for (unsigned i = 0; i < max_n_send_or_recv; i++)
3231 {
3232 block_lengths[i] = 1;
3233 }
3234
3235 // perform the sends and receives
3237 for (unsigned p = 0; p < nproc; p++)
3238 {
3239 // send and recv with other processors
3240 if (p != my_rank)
3241 {
3242 // send
3243 if (nblock_send[p] > 0)
3244 {
3245 // create the datatypes vector
3247
3248 // create the datatypes
3249 unsigned ptr = 0;
3250 for (unsigned b = 0; b < nblock; b++)
3251 {
3252 const unsigned required_block = block_vec_number[b];
3253
3254 if (Nrows_to_send_for_get_block(required_block, p) > 0)
3255 {
// Indexed datatype selecting, out of v, the rows destined for
// block b on processor p (all block lengths are 1).
3256 MPI_Type_indexed(Nrows_to_send_for_get_block(required_block, p),
3258 Rows_to_send_for_get_block(required_block, p),
3259 MPI_DOUBLE,
3262 ptr++;
3263 }
3264 }
3265
3266 // compute the displacements and lengths
3268 int lengths[nblock_send[p]];
3269 for (int i = 0; i < nblock_send[p]; i++)
3270 {
3271 lengths[i] = 1;
3272 displacements[i] = 0;
3273 }
3274
3275 // build the final datatype
3278 lengths,
3281 &type_send);
3283
3284 // send
3286 MPI_Isend(const_cast<double*>(v.values_pt()),
3287 1,
3288 type_send,
3289 p,
3290 0,
3291 this->distribution_pt()->communicator_pt()->mpi_comm(),
3292 &send_req);
3294 for (int i = 0; i < nblock_send[p]; i++)
3295 {
3297 }
3298 requests.push_back(send_req);
3299 }
3300
3301 // recv
3302 if (nblock_recv[p] > 0)
3303 {
3304 // create the datatypes vector
3306
3307 // and the displacements
3309
3310 // and the lengths
3311 int lengths[nblock_recv[p]];
3312
3313 // all displacements are computed relative to s[0] values
3315 MPI_Get_address(s[0].values_pt(), &displacements_base);
3316
3317 // now build
3318 unsigned ptr = 0;
3319 for (unsigned b = 0; b < nblock; b++)
3320 {
3321 const unsigned required_block = block_vec_number[b];
3322
3323 if (Nrows_to_recv_for_get_block(required_block, p) > 0)
3324 {
3325 MPI_Type_indexed(Nrows_to_recv_for_get_block(required_block, p),
3327 Rows_to_recv_for_get_block(required_block, p),
3328 MPI_DOUBLE,
3331 MPI_Get_address(s[b].values_pt(), &displacements[ptr]);
3333 lengths[ptr] = 1;
3334 ptr++;
3335 }
3336 }
3337
3338 // build the final data type
3341 lengths,
3344 &type_recv);
3346
3347 // recv
3349 MPI_Irecv(s[0].values_pt(),
3350 1,
3351 type_recv,
3352 p,
3353 0,
3354 this->distribution_pt()->communicator_pt()->mpi_comm(),
3355 &recv_req);
3357 for (int i = 0; i < nblock_recv[p]; i++)
3358 {
3360 }
3361 requests.push_back(recv_req);
3362 }
3363 }
3364
3365 // communicate with self
3366 else
3367 {
// On-processor copy: rows of v (send indices) map straight onto
// rows of s[b] (recv indices); no MPI calls needed.
3368 const double* v_values_pt = v.values_pt();
3369 for (unsigned b = 0; b < nblock; b++)
3370 {
3371 const unsigned required_block = block_vec_number[b];
3372
3373 double* w_values_pt = s[b].values_pt();
3374 for (unsigned i = 0;
3375 i < Nrows_to_send_for_get_block(required_block, p);
3376 i++)
3377 {
3378 w_values_pt[Rows_to_recv_for_get_block(required_block, p)[i]] =
3379 v_values_pt[Rows_to_send_for_get_block(required_block, p)[i]];
3380 }
3381 }
3382 }
3383 }
3384
3385 // and then just wait
// Wait for all outstanding non-blocking sends/recvs to complete
// before freeing the scratch lengths array.
3386 unsigned c = requests.size();
3388 if (c)
3389 {
3390 MPI_Waitall(c, &requests[0], &stat[0]);
3391 }
3392 delete[] block_lengths;
3393
3394#else
3395 // throw error
3396 std::ostringstream error_message;
3397 error_message << "The preconditioner is distributed and on more than one "
3398 << "processor. MPI is required.";
3399 throw OomphLibError(
3400 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3401#endif
3402 }
3403 }
3404
3405 //============================================================================
3406 /// A helper function, takes the naturally ordered vector and
3407 /// rearranges it into a vector of sub vectors corresponding to the blocks,
3408 /// so s[b][i] contains the i-th entry in the vector associated with block b.
3409 /// The block_vec_number indicates which blocks we want.
3410 /// These blocks and vectors are those corresponding to the internal blocks.
3411 /// Note: If the preconditioner is a subsidiary preconditioner then only the
3412 /// sub-vectors associated with the blocks of the subsidiary preconditioner
3413 /// will be included. Hence the length of v is master_nrow() whereas the
3414 /// total length of the s vectors is the sum of the Nrow of the sub vectors.
3415 /// This is simply a wrapper around the other internal_get_block_vectors(...)
3416 /// function with the identity block_vec_number vector.
3417 //============================================================================
3418 template<typename MATRIX>
3420 const DoubleVector& v, Vector<DoubleVector>& s) const
3421 {
3422 // Number of block types
3423 const unsigned nblock = this->internal_nblock_types();
3425 for (unsigned b = 0; b < nblock; b++)
3426 {
3427 block_vec_number[b] = b;
3428 }
3429
3430 internal_get_block_vectors(block_vec_number, v, s);
3431 }
3432
3433 //============================================================================
3434 /// Takes the vector of block vectors, s, and copies its entries into
3435 /// the naturally ordered vector, v. If this is a subsidiary block
3436 /// preconditioner only those entries in v that are associated with its
3437 /// blocks are affected. The block_vec_number indicates which block the
3438 /// vectors in s came from. The block number corresponds to the block
3439 /// numbers in this preconditioner.
3440 //============================================================================
3441 template<typename MATRIX>
3444 const Vector<DoubleVector>& s,
3445 DoubleVector& v) const
3446 {
// NOTE(review): this listing is a doxygen text dump; lines carrying
// hyperlinked identifiers (the function-name line of the signature,
// para_block_vec_number_size, several PARANOID condition lines and
// OomphLibError argument lines, the most_fine_grain_dof/dof_vector
// declarations, and the dof-splitting helper call) were dropped during
// extraction. Compare against the original block_preconditioner.cc
// before editing this function.
3447#ifdef PARANOID
3448
3449 // Check if v is built.
3450 if (!v.built())
3451 {
3452 std::ostringstream err_msg;
3453 err_msg << "The distribution of the global vector v must be setup.";
3454 throw OomphLibError(
3456 }
3457
3458 // v must have the same distribution as the upper-most master block
3459 // preconditioner, since the upper-most master block preconditioner
3460 // should have the same distribution as the matrix pointed to
3461 // by matrix_pt().
3462 if (*(v.distribution_pt()) != *(this->master_distribution_pt()))
3463 {
3464 std::ostringstream err_msg;
3465 err_msg << "The distribution of the global vector v must match the "
3466 << " specified master_distribution_pt(). \n"
3467 << "i.e. Distribution_pt in the master preconditioner";
3468 throw OomphLibError(
3470 }
3471
3472 // Check if the number of vectors in s is the same as the number of block
3473 // numbers described in block_vec_number.
3475 const unsigned para_s_size = s.size();
3477 {
3478 std::ostringstream err_msg;
3479 err_msg << "block_vec_number.size() is " << para_block_vec_number_size
3480 << "\n."
3481 << "s.size() is " << para_s_size << ".\n"
3482 << "But they must be the same size!\n";
3483 throw OomphLibError(
3485 }
3486
3487 // Check to see if there are more blocks defined in the block_vec_number
3488 // vector than the number of block types. This is not allowed.
3489 const unsigned para_n_block = nblock_types();
3491 {
3492 std::ostringstream err_msg;
3493 err_msg << "Trying to return " << para_block_vec_number_size
3494 << " block vectors.\n"
3495 << "But there are only " << para_n_block << " block types.\n";
3496 throw OomphLibError(
3498 }
3499
3500 // Check if any block numbers defined in block_vec_number is equal to or
3501 // greater than the number of block types.
3502 // E.g. if there are 5 block types, we can only have block numbers:
3503 // 0, 1, 2, 3 and 4.
3504 for (unsigned b = 0; b < para_block_vec_number_size; b++)
3505 {
3506 const unsigned para_required_block = block_vec_number[b];
3508 {
3509 std::ostringstream err_msg;
3510 err_msg << "block_vec_number[" << b << "] is " << para_required_block
3511 << ".\n"
3512 << "But there are only " << para_n_block << " block types.\n";
3513 throw OomphLibError(
3515 }
3516 }
3517
3518 // Check that no block number is inserted twice.
3519 std::set<unsigned> para_set;
3520 for (unsigned b = 0; b < para_block_vec_number_size; b++)
3521 {
3522 std::pair<std::set<unsigned>::iterator, bool> para_set_ret;
3524
3525 if (!para_set_ret.second)
3526 {
3527 std::ostringstream err_msg;
3528 err_msg << "Error: the block number " << block_vec_number[b]
3529 << " appears twice.\n";
3530 throw OomphLibError(
3532 }
3533 }
3534
3535 // Check to see that all the vectors in s are built
3536 // (since we are trying to return them).
3537 for (unsigned b = 0; b < para_block_vec_number_size; b++)
3538 {
3539 if (!s[b].built())
3540 {
3541 std::ostringstream err_msg;
3542 err_msg << "The distribution of the block vector s[" << b
3543 << "] must be setup.\n";
3544 throw OomphLibError(
3546 }
3547 }
3548
3549 // Since these are built, we check that the distributions are correct.
3550 // This are incorrect if the block numbers in block_vec_number and
3551 // the vectors in s does not match.
3552 for (unsigned b = 0; b < para_block_vec_number_size; b++)
3553 {
3554 if (*(s[b].distribution_pt()) !=
3555 *(Block_distribution_pt[block_vec_number[b]]))
3556 {
3557 std::ostringstream error_message;
3558 error_message
3559 << "The distribution of the block vector " << b << " must match the"
3560 << " specified distribution at "
3561 << "Block_distribution_pt[" << block_vec_number[b] << "].\n"
3562 << "The distribution of the Block_distribution_pt is determined by\n"
3563 << "the vector block_vec_number. Perhaps it is incorrect?\n";
3564 throw OomphLibError(error_message.str(),
3567 }
3568 }
3569#endif
3570
3571 // Number of blocks to get.
3572 const unsigned n_block = block_vec_number.size();
3573
3574 // Each block is made of dof types. We get the most fine grain dof types.
3575 // Most fine grain in the sense that these are the dof types that belongs
3576 // in this block before any coarsening of dof types has taken place.
3577 // The ordering of the dof types matters, this is handled properly when
3578 // creating the Block_to_dof_map_fine vector and must be respected here.
3579 // I.e. we cannot arbitrarily insert dof types (even if they are correct)
3580 // in the vector most_fine_grain_dof.
3582 for (unsigned b = 0; b < n_block; b++)
3583 {
3584 const unsigned mapped_b = block_vec_number[b];
3585
3587 Block_to_dof_map_fine[mapped_b].begin(),
3588 Block_to_dof_map_fine[mapped_b].end());
3589 }
3590
3591 // Split all the blocks into its most fine grain dof vector.
3593
3594 unsigned offset = 0;
3595
3596 // Perform the splitting for each block.
3597 for (unsigned b = 0; b < n_block; b++)
3598 {
3599 // The actual block number.
3600 const unsigned mapped_b = block_vec_number[b];
3601
3602 // How many most fine grain dof types are associated with this block?
3603 const unsigned ndof = Block_to_dof_map_fine[mapped_b].size();
3604
3605 if (ndof == 1)
3606 // No need to split, just copy.
3607 {
3608 dof_vector[offset] = s[b];
3609 }
3610 else
3611 // Need to split s[b] into its most fine grain dof vectors
3612 {
3613 // To store pointers to the dof vectors associated with this block.
3615
3616 for (unsigned d = 0; d < ndof; d++)
3617 {
3618 const unsigned offset_plus_d = offset + d;
3619
3620 // build the dof vector.
3622 Internal_block_distribution_pt[most_fine_grain_dof[offset_plus_d]]);
3623
3624 // Store the pointer.
3626 }
3627
3628 // Split without communication.
// NOTE(review): the splitting helper call itself (presumably a
// DoubleVectorHelpers split -- verify against the original source)
// was lost by the extraction here.
3631 }
3632
3633 // Update the offset!
3634 offset += ndof;
3635 }
3636
3637 // Return the block vectors all in one go.
3638 internal_return_block_vectors(most_fine_grain_dof, dof_vector, v);
3639 } // return_block_vectors(...)
3640
3641
3642 //============================================================================
3643 /// Takes the vector of block vectors, s, and copies its entries into
3644 /// the naturally ordered vector, v. If this is a subsidiary block
3645 /// preconditioner only those entries in v that are associated with its
3646 /// blocks are affected. The block_vec_number indicates which block the
3647 /// vectors in s came from. The block number corresponds to the block
3648 /// numbers in this preconditioner.
3649 /// This is simply a wrapper around the other return_block_vectors(...)
3650 /// function where the block_vec_number Vector is the identity, i.e.
3651 /// block_vec_number is [0, 1, ..., nblock_types - 1].
3652 //============================================================================
3653 template<typename MATRIX>
3655 const Vector<DoubleVector>& s, DoubleVector& v) const
3656 {
3657 // The number of block types in this preconditioner.
3658 const unsigned n_block = nblock_types();
3659
3660 // Create the identity vector.
3662 for (unsigned i = 0; i < n_block; i++)
3663 {
3664 required_block[i] = i;
3665 }
3666
3667 // Call the other return_block_vectors function which does the work.
3668 return_block_vectors(required_block, s, v);
3669 } // return_block_vectors(...)
3670
 3671 //============================================================================
 3672 /// Takes the vector of block vectors, s, and copies its entries into the
 3673 /// naturally ordered vector, v, so that entry i of s[b] is written to the
 3674 /// row of v associated with block b. The block_vec_number indicates which
 3675 /// blocks the vectors in s came from. These blocks and vectors are those
 3676 /// corresponding to the internal blocks.
 3677 /// Note: If the preconditioner is a subsidiary preconditioner then only
 3678 /// those entries of v that are associated with the blocks of the
 3679 /// subsidiary preconditioner are written. Hence the length of v is
 3680 /// master_nrow() whereas the total length of the s vectors is the sum of
 3681 /// the Nrow of the sub vectors.
 3681 //============================================================================
3682 template<typename MATRIX>
3685 const Vector<DoubleVector>& s,
3686 DoubleVector& v) const
3687 {
// NOTE(review): this listing is a doxygen text dump; lines that carried
// hyperlinked identifiers (the function-name line of the signature, the
// nblock_send/nblock_recv counters, the MPI datatype / displacement /
// request containers and, presumably, the MPI_Type_commit,
// MPI_Type_create_struct and MPI_Type_free calls) were dropped during
// extraction. Compare against the original block_preconditioner.cc
// before editing this function.
3688 // the number of blocks
3689 const unsigned nblock = block_vec_number.size();
3690
3691#ifdef PARANOID
3692 if (!v.built())
3693 {
3694 std::ostringstream error_message;
3695 error_message << "The distribution of the global vector v must be setup.";
3696 throw OomphLibError(
3697 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3698 }
3699 if (*(v.distribution_pt()) != *(this->master_distribution_pt()))
3700 {
3701 std::ostringstream error_message;
3702 error_message << "The distribution of the global vector v must match the "
3703 << " specified master_distribution_pt(). \n"
3704 << "i.e. Distribution_pt in the master preconditioner";
3705 throw OomphLibError(
3706 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3707 }
3708 for (unsigned b = 0; b < nblock; b++)
3709 {
3710 if (!s[b].built())
3711 {
3712 std::ostringstream error_message;
3713 error_message << "The distribution of the block vector " << b
3714 << " must be setup.";
3715 throw OomphLibError(error_message.str(),
3718 }
3719 const unsigned required_block = block_vec_number[b];
3720 if (*(s[b].distribution_pt()) !=
3721 *(Internal_block_distribution_pt[required_block]))
3722 {
3723 std::ostringstream error_message;
3724 error_message
3725 << "The distribution of the block vector " << b << " must match the"
3726 << " specified distribution at Internal_block_distribution_pt[" << b
3727 << "]";
3728 throw OomphLibError(error_message.str(),
3731 }
3732 }
3733#endif
3734
3735 // if + only one processor
3736 // + more than one processor but matrix_pt is not distributed
3737 // then use the serial get_block method
3738 if (this->distribution_pt()->communicator_pt()->nproc() == 1 ||
3739 !this->distribution_pt()->distributed())
3740 {
// Serial path: scatter each block vector back into v via the
// Global_index lookup table -- no communication needed.
3741 double* v_pt = v.values_pt();
3742 for (unsigned b = 0; b < nblock; b++)
3743 {
3744 const unsigned required_block = block_vec_number[b];
3745
3746 const double* s_pt = s[b].values_pt();
3747 unsigned nrow = this->internal_block_dimension(required_block);
3748 for (unsigned i = 0; i < nrow; i++)
3749 {
3750 v_pt[this->Global_index[required_block][i]] = s_pt[i];
3751 }
3752 }
3753 }
3754 // otherwise use mpi
3755 else
3756 {
3757#ifdef OOMPH_HAS_MPI
3758
3759 // my rank
3760 unsigned my_rank = this->distribution_pt()->communicator_pt()->my_rank();
3761
3762 // the number of processors
3763 unsigned nproc = this->distribution_pt()->communicator_pt()->nproc();
3764
3765 // determine the maximum number of rows to be sent or recv
3766 // and determine the number of blocks each processor will send and recv
3767 // communication for
// Note: the send/recv roles are deliberately swapped relative to
// internal_get_block_vectors(...) -- returning data to v is the
// reverse of the "get" communication, so rows that were *sent* for
// a get are *received* for a return, and vice versa.
3770 unsigned max_n_send_or_recv = 0;
3771 for (unsigned p = 0; p < nproc; p++)
3772 {
3773 for (unsigned b = 0; b < nblock; b++)
3774 {
3775 const unsigned required_block = block_vec_number[b];
3776
3777 max_n_send_or_recv = std::max(
3778 max_n_send_or_recv, Nrows_to_send_for_get_block(required_block, p));
3779 max_n_send_or_recv = std::max(
3780 max_n_send_or_recv, Nrows_to_recv_for_get_block(required_block, p));
3781 if (Nrows_to_send_for_get_block(required_block, p) > 0)
3782 {
3783 nblock_recv[p]++;
3784 }
3785 if (Nrows_to_recv_for_get_block(required_block, p) > 0)
3786 {
3787 nblock_send[p]++;
3788 }
3789 }
3790 }
3791
3792 // create a vectors of 1s the size of the nblock for the mpi indexed
3793 // data types
3794 int* block_lengths = new int[max_n_send_or_recv];
3795 for (unsigned i = 0; i < max_n_send_or_recv; i++)
3796 {
3797 block_lengths[i] = 1;
3798 }
3799
3800 // perform the sends and receives
3802 for (unsigned p = 0; p < nproc; p++)
3803 {
3804 // send and recv with other processors
3805 if (p != my_rank)
3806 {
3807 // recv
3808 if (nblock_recv[p] > 0)
3809 {
3810 // create the datatypes vector
3812
3813 // create the datatypes
3814 unsigned ptr = 0;
3815 for (unsigned b = 0; b < nblock; b++)
3816 {
3817 const unsigned required_block = block_vec_number[b];
3818
3819 if (Nrows_to_send_for_get_block(required_block, p) > 0)
3820 {
// Indexed datatype selecting the rows of v that processor p
// returns to us (the rows we originally sent for the get).
3821 MPI_Type_indexed(Nrows_to_send_for_get_block(required_block, p),
3823 Rows_to_send_for_get_block(required_block, p),
3824 MPI_DOUBLE,
3827 ptr++;
3828 }
3829 }
3830
3831 // compute the displacements and lengths
3833 int lengths[nblock_recv[p]];
3834 for (int i = 0; i < nblock_recv[p]; i++)
3835 {
3836 lengths[i] = 1;
3837 displacements[i] = 0;
3838 }
3839
3840 // build the final datatype
3843 lengths,
3846 &type_recv);
3848
3849 // recv
3851 MPI_Irecv(v.values_pt(),
3852 1,
3853 type_recv,
3854 p,
3855 0,
3856 this->distribution_pt()->communicator_pt()->mpi_comm(),
3857 &recv_req);
3859 for (int i = 0; i < nblock_recv[p]; i++)
3860 {
3862 }
3863 requests.push_back(recv_req);
3864 }
3865
3866 // send
3867 if (nblock_send[p] > 0)
3868 {
3869 // create the datatypes vector
3871
3872 // and the displacements
3874
3875 // and the lengths
3876 int lengths[nblock_send[p]];
3877
3878 // all displacements are computed relative to s[0] values
3880 MPI_Get_address(const_cast<double*>(s[0].values_pt()),
3882
3883 // now build
3884 unsigned ptr = 0;
3885 for (unsigned b = 0; b < nblock; b++)
3886 {
3887 const unsigned required_block = block_vec_number[b];
3888
3889 if (Nrows_to_recv_for_get_block(required_block, p) > 0)
3890 {
3891 MPI_Type_indexed(Nrows_to_recv_for_get_block(required_block, p),
3893 Rows_to_recv_for_get_block(required_block, p),
3894 MPI_DOUBLE,
3897 MPI_Get_address(const_cast<double*>(s[b].values_pt()),
3898 &displacements[ptr]);
3900 lengths[ptr] = 1;
3901 ptr++;
3902 }
3903 }
3904
3905 // build the final data type
3908 lengths,
3911 &type_send);
3913
3914 // send
3916 MPI_Isend(const_cast<double*>(s[0].values_pt()),
3917 1,
3918 type_send,
3919 p,
3920 0,
3921 this->distribution_pt()->communicator_pt()->mpi_comm(),
3922 &send_req);
3924 for (int i = 0; i < nblock_send[p]; i++)
3925 {
3927 }
3928 requests.push_back(send_req);
3929 }
3930 }
3931
3932 // communicate with self
3933 else
3934 {
// On-processor copy: rows of s[b] (recv indices) map straight back
// onto rows of v (send indices); no MPI calls needed.
3935 double* v_values_pt = v.values_pt();
3936 for (unsigned b = 0; b < nblock; b++)
3937 {
3938 const unsigned required_block = block_vec_number[b];
3939
3940 const double* w_values_pt = s[b].values_pt();
3941 for (unsigned i = 0;
3942 i < Nrows_to_send_for_get_block(required_block, p);
3943 i++)
3944 {
3945 v_values_pt[Rows_to_send_for_get_block(required_block, p)[i]] =
3946 w_values_pt[Rows_to_recv_for_get_block(required_block, p)[i]];
3947 }
3948 }
3949 }
3950 }
3951
3952 // and then just wait
// Wait for all outstanding non-blocking sends/recvs to complete
// before freeing the scratch lengths array.
3953 unsigned c = requests.size();
3955 if (c)
3956 {
3957 MPI_Waitall(c, &requests[0], &stat[0]);
3958 }
3959 delete[] block_lengths;
3960
3961#else
3962 // throw error
3963 std::ostringstream error_message;
3964 error_message << "The preconditioner is distributed and on more than one "
3965 << "processor. MPI is required.";
3966 throw OomphLibError(
3967 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3968#endif
3969 }
3970 }
3971
 3972 //============================================================================
 3973 /// A helper function, takes the vector of block vectors, s, and copies
 3974 /// its entries into the naturally ordered vector, v.
 3975 /// These blocks and vectors are those corresponding to the internal
 3976 /// blocks.
 3977 /// Note: If the preconditioner is a subsidiary preconditioner then only
 3978 /// those entries of v that are associated with the blocks of the
 3979 /// subsidiary preconditioner are written. Hence the length of v is
 3980 /// master_nrow() whereas the total length of the s vectors is the sum of
 3981 /// the Nrow of the sub vectors.
 3982 /// This is simply a wrapper around the other
 3983 /// internal_return_block_vectors(...) function with the identity
 3984 /// block_vec_number vector.
 3984 //============================================================================
3985 template<typename MATRIX>
3987 const Vector<DoubleVector>& s, DoubleVector& v) const
3988 {
3989 // the number of blocks
3990 const unsigned nblock = this->internal_nblock_types();
3992 for (unsigned b = 0; b < nblock; b++)
3993 {
3994 block_vec_number[b] = b;
3995 }
3996
3997 internal_return_block_vectors(block_vec_number, s, v);
3998 }
3999
4000 //============================================================================
4001 /// A helper function, takes the naturally ordered vector, v,
4002 /// and extracts the n-th block vector, b.
4003 /// Here n is the block number in the current preconditioner.
4004 /// NOTE: The ordering of the vector b is the same as the
4005 /// ordering of the block matrix from internal_get_block(...).
4006 //============================================================================
4007 template<typename MATRIX>
4009 const unsigned& b, const DoubleVector& v, DoubleVector& w) const
4010 {
// NOTE(review): this listing is a doxygen text dump; lines that carried
// hyperlinked identifiers (the function-name line of the signature, the
// MPI_Datatype declarations / MPI_Type_commit / MPI_Type_free calls and
// the request/status containers) were dropped during extraction.
// Compare against the original block_preconditioner.cc before editing.
4011#ifdef PARANOID
4012 // the number of blocks
4013 const unsigned n_blocks = this->internal_nblock_types();
4014
4015 // paranoid check that block i is in this block preconditioner
4016 if (b >= n_blocks)
4017 {
4018 std::ostringstream error_message;
4019 error_message
4020 << "Requested block vector " << b
4021 << ", however this preconditioner has internal_nblock_types() "
4022 << "= " << internal_nblock_types() << std::endl;
4023 throw OomphLibError(
4024 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4025 }
4026 if (!v.built())
4027 {
4028 std::ostringstream error_message;
4029 error_message << "The distribution of the global vector v must be setup.";
4030 throw OomphLibError(
4031 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4032 }
4033 if (*(v.distribution_pt()) != *(this->master_distribution_pt()))
4034 {
4035 std::ostringstream error_message;
4036 error_message << "The distribution of the global vector v must match the "
4037 << " specified master_distribution_pt(). \n"
4038 << "i.e. Distribution_pt in the master preconditioner";
4039 throw OomphLibError(
4040 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4041 }
4042#endif
4043
4044 // rebuild the block vector
// w is rebuilt with the b-th internal block distribution and
// zero-initialised before the rows of v are copied in.
4045 w.build(Internal_block_distribution_pt[b], 0.0);
4046
4047 // if + only one processor
4048 // + more than one processor but matrix_pt is not distributed
4049 // then use the serial get_block method
4050 if (this->distribution_pt()->communicator_pt()->nproc() == 1 ||
4051 !this->distribution_pt()->distributed())
4052 {
4053 double* w_pt = w.values_pt();
4054 const double* v_pt = v.values_pt();
4055 unsigned n_row = w.nrow();
4056 for (unsigned i = 0; i < n_row; i++)
4057 {
4058 w_pt[i] = v_pt[this->Global_index[b][i]];
4059 }
4060 }
4061 // otherwise use mpi
4062 else
4063 {
4064#ifdef OOMPH_HAS_MPI
4065
4066 // my rank
4067 unsigned my_rank = this->distribution_pt()->communicator_pt()->my_rank();
4068
4069 // the number of processors
4070 unsigned nproc = this->distribution_pt()->communicator_pt()->nproc();
4071
4072 // determine the maximum number of rows to be sent or recv
4073 unsigned max_n_send_or_recv = 0;
4074 for (unsigned p = 0; p < nproc; p++)
4075 {
4077 std::max(max_n_send_or_recv, Nrows_to_send_for_get_block(b, p));
4079 std::max(max_n_send_or_recv, Nrows_to_recv_for_get_block(b, p));
4080 }
4081
4082 // create a vectors of 1s (the size of the nblock for the mpi indexed
4083 // data types
4084 int* block_lengths = new int[max_n_send_or_recv];
4085 for (unsigned i = 0; i < max_n_send_or_recv; i++)
4086 {
4087 block_lengths[i] = 1;
4088 }
4089
4090 // perform the sends and receives
4092 for (unsigned p = 0; p < nproc; p++)
4093 {
4094 // send and recv with other processors
4095 if (p != my_rank)
4096 {
4097 if (Nrows_to_send_for_get_block(b, p) > 0)
4098 {
4099 // create the send datatype
// Indexed datatype selecting, out of v, the rows destined for
// block b on processor p (all block lengths are 1).
4101 MPI_Type_indexed(Nrows_to_send_for_get_block(b, p),
4103 Rows_to_send_for_get_block(b, p),
4104 MPI_DOUBLE,
4105 &type_send);
4107
4108 // send
4110 MPI_Isend(const_cast<double*>(v.values_pt()),
4111 1,
4112 type_send,
4113 p,
4114 0,
4115 this->distribution_pt()->communicator_pt()->mpi_comm(),
4116 &send_req);
4118 requests.push_back(send_req);
4119 }
4120
4121 if (Nrows_to_recv_for_get_block(b, p) > 0)
4122 {
4123 // create the recv datatype
4125 MPI_Type_indexed(Nrows_to_recv_for_get_block(b, p),
4127 Rows_to_recv_for_get_block(b, p),
4128 MPI_DOUBLE,
4129 &type_recv);
4131
4132 // recv
4134 MPI_Irecv(w.values_pt(),
4135 1,
4136 type_recv,
4137 p,
4138 0,
4139 this->distribution_pt()->communicator_pt()->mpi_comm(),
4140 &recv_req);
4142 requests.push_back(recv_req);
4143 }
4144 }
4145
4146 // communicate with self
4147 else
4148 {
// On-processor copy: rows of v (send indices) map straight onto
// rows of w (recv indices); no MPI calls needed.
4149 double* w_values_pt = w.values_pt();
4150 const double* v_values_pt = v.values_pt();
4151 for (unsigned i = 0; i < Nrows_to_send_for_get_block(b, p); i++)
4152 {
4153 w_values_pt[Rows_to_recv_for_get_block(b, p)[i]] =
4154 v_values_pt[Rows_to_send_for_get_block(b, p)[i]];
4155 }
4156 }
4157 }
4158
4159 // and then just wait
// Wait for all outstanding non-blocking sends/recvs to complete
// before freeing the scratch lengths array.
4160 unsigned c = requests.size();
4162 if (c)
4163 {
4164 MPI_Waitall(c, &requests[0], &stat[0]);
4165 }
4166 delete[] block_lengths;
4167
4168#else
4169 // throw error
4170 std::ostringstream error_message;
4171 error_message << "The preconditioner is distributed and on more than one "
4172 << "processor. MPI is required.";
4173 throw OomphLibError(
4174 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4175#endif
4176 }
4177 }
4178
4179 //============================================================================
4180 /// Takes the naturally ordered vector, v and returns the n-th
4181 /// block vector, b. Here n is the block number in the current
4182 /// preconditioner.
4183 //============================================================================
// NOTE(review): this chunk is a lossy Doxygen source-browser extraction --
// the upstream line numbers are fused into each code line and several
// numbered lines are missing (here: 4185, the signature line, presumably
// "void BlockPreconditioner<MATRIX>::get_block_vector(const unsigned& b,";
// 4202/4210/4219, the OomphLibError argument lines; 4265 and 4273, see the
// inline notes below). Recover the missing lines from the upstream source
// before attempting to compile this file.
//
// Purpose: extract the b-th block vector (block numbering of the current
// preconditioner) from the naturally ordered global vector v into w,
// honouring dof-type coarsening via the Block_to_dof_map_fine lookup.
4184 template<typename MATRIX>
4186 const DoubleVector& v,
4187 DoubleVector& w) const
4188 {
4189#ifdef PARANOID
4190 // the number of blocks
4191 const unsigned para_n_blocks = nblock_types();
4192
4193 // paranoid check that block i is in this block preconditioner
4194 if (b >= para_n_blocks)
4195 {
4196 std::ostringstream err_msg;
4197 err_msg << "Requested block vector " << b
4198 << ", however this preconditioner has only " << para_n_blocks
4199 << " block types"
4200 << ".\n";
4201 throw OomphLibError(
4203 }
4204
4205 if (!v.built())
4206 {
4207 std::ostringstream err_msg;
4208 err_msg << "The distribution of the global vector v must be setup.";
4209 throw OomphLibError(
4211 }
4212 if (*(v.distribution_pt()) != *(this->master_distribution_pt()))
4213 {
4214 std::ostringstream err_msg;
4215 err_msg << "The distribution of the global vector v must match the "
4216 << " specified master_distribution_pt(). \n"
4217 << "i.e. Distribution_pt in the master preconditioner";
4218 throw OomphLibError(
4220 }
4221#endif
4222
4223 // Recall that, the relationship between the external blocks and the
4224 // external dof types, as seen by the preconditioner writer is stored in the
4225 // mapping Block_to_dof_map_coarse.
4226 //
4227 // However, each dof type could have been coarsened! The relationship
4228 // between the dof types of this preconditioner and the parent
4229 // preconditioner is stored in the mapping Doftype_coarsen_map_coarse. The
4230 // dof numbers in this map is relative to this preconditioner.
4231 //
4232 // Finally, the relationship between the dof types of this preconditioner
4233 // and the most fine grain dof types is stored in the mapping
4234 // Doftype_coarsen_map_fine. Again, the dof numbers in this map is relative
4235 // to this preconditioner.
4236 //
4237 // Furthermore, we note that concatenation of vectors without communication
4238 // is associative, but not commutative. I.e.
4239 // (V1+V2)+V3 = V1 + (V2 + V3), where + is concatenation without
4240 // communication.
4241 //
4242 // So all we need is the vectors listed in the correct order.
4243 //
4244 // We need only Block_to_dof_map_coarse to tell us which external dof types
4245 // are in this block, then Doftype_coarsen_map_fine to tell us which most
4246 // fine grain dofs to concatenate!
4247 //
4248 // All the mapping vectors are constructed to respect the ordering of
4249 // the dof types.
4250
4251 // Get the most fine grain block to dof mapping.
4252 Vector<unsigned> most_fine_grain_dof = Block_to_dof_map_fine[b];
4253
4254 // How many vectors do we need to concatenate?
4255 const unsigned n_dof_vec = most_fine_grain_dof.size();
4256
4257 if (n_dof_vec == 1)
4258 // No need to concatenate, just extract the vector.
4259 {
4260 internal_get_block_vector(most_fine_grain_dof[0], v, w);
4261 }
4262 else
4263 // Need to concatenate dof-level vectors.
4264 {
// NOTE(review): missing line 4265 here -- presumably
// "Vector<DoubleVector> dof_vector(n_dof_vec);" (dof_vector is used below).
4266
4267 // Get all the dof-level vectors in one go
4268 internal_get_block_vectors(most_fine_grain_dof, v, dof_vector);
4269 // Build w with the correct distribution.
4270 w.build(Block_distribution_pt[b], 0);
4271
4272 // Concatenate the vectors.
// NOTE(review): missing line 4273 here -- presumably the
// "DoubleVectorHelpers::concatenate_without_communication(dof_vector, w);"
// call announced by the comment above.
4274
4275 dof_vector.clear();
4276 }
4277 } // get_block_vector(...)
4278
4279 //============================================================================
4280 /// Takes the n-th block ordered vector, b, and copies its entries
4281 /// to the appropriate entries in the naturally ordered vector, v.
4282 /// Here n is the block number in the current block preconditioner.
4283 /// If the preconditioner is a subsidiary block preconditioner
4284 /// the other entries in v that are not associated with it
4285 /// are left alone.
4286 ///
4287 /// This version works with the internal block types. This is legacy code
4288 /// but is kept alive, hence moved to private. Please use the
4289 /// function "return_block_vector(...)".
4290 //============================================================================
// Purpose: legacy/internal routine -- copy the entries of the internal
// block vector w (for internal block b) back into their positions in the
// naturally ordered global vector v; entries of v not associated with
// block b are left untouched. The serial path scatters via the
// Global_index lookup; the distributed path exchanges rows using
// MPI_Type_indexed / MPI_Isend / MPI_Irecv / MPI_Waitall.
//
// NOTE(review): lossy Doxygen extraction -- missing numbered lines in this
// function include the left-hand sides of the max_n_send_or_recv updates
// (4376/4378, presumably "max_n_send_or_recv ="), the request container
// (presumably 4391: "Vector<MPI_Request> requests;"), the MPI_Datatype
// declarations and MPI_Type_commit/MPI_Type_free calls around each
// MPI_Type_indexed, the MPI_Request declarations (send_req/recv_req), and
// the status array (presumably 4461: "Vector<MPI_Status> stat(c);").
// Recover from the upstream source before compiling.
4291 template<typename MATRIX>
4293 const unsigned& b, const DoubleVector& w, DoubleVector& v) const
4294 {
4295#ifdef PARANOID
4296 // the number of blocks
4297 const unsigned n_blocks = this->internal_nblock_types();
4298
4299 // paranoid check that block i is in this block preconditioner
4300 if (b >= n_blocks)
4301 {
4302 std::ostringstream error_message;
4303 error_message
4304 << "Requested block vector " << b
4305 << ", however this preconditioner has internal_nblock_types() "
4306 << "= " << internal_nblock_types() << std::endl;
4307 throw OomphLibError(
4308 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4309 }
4310 if (!v.built())
4311 {
4312 std::ostringstream error_message;
4313 error_message << "The distribution of the global vector v must be setup.";
4314 throw OomphLibError(
4315 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4316 }
4317 if (*v.distribution_pt() != *this->master_distribution_pt())
4318 {
4319 std::ostringstream error_message;
4320 error_message << "The distribution of the global vector v must match the "
4321 << " specified master_distribution_pt(). \n"
4322 << "i.e. Distribution_pt in the master preconditioner";
4323 throw OomphLibError(
4324 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4325 }
4326 if (!w.built())
4327 {
4328 std::ostringstream error_message;
4329 error_message << "The distribution of the block vector w must be setup.";
4330 throw OomphLibError(
4331 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4332 }
4333 if (*w.distribution_pt() != *Internal_block_distribution_pt[b])
4334 {
4335 std::ostringstream error_message;
4336 error_message
4337 << "The distribution of the block vector w must match the "
4338 << " specified distribution at Internal_block_distribution_pt[b]";
4339 throw OomphLibError(
4340 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4341 }
4342#endif
4343
4344 // if + only one processor
4345 // + more than one processor but matrix_pt is not distributed
4346 // then use the serial get_block method
4347 if (this->distribution_pt()->communicator_pt()->nproc() == 1 ||
4348 !this->distribution_pt()->distributed())
4349 {
4350 // length of vector
4351 unsigned n_row = this->internal_block_dimension(b);
4352
4353 // copy back from the block vector to the naturally ordered vector
4354 double* v_pt = v.values_pt();
4355 const double* w_pt = w.values_pt();
4356 for (unsigned i = 0; i < n_row; i++)
4357 {
4358 v_pt[this->Global_index[b][i]] = w_pt[i];
4359 }
4360 }
4361 // otherwise use mpi
4362 else
4363 {
4364#ifdef OOMPH_HAS_MPI
4365
4366 // my rank
4367 unsigned my_rank = this->distribution_pt()->communicator_pt()->my_rank();
4368
4369 // the number of processors
4370 unsigned nproc = this->distribution_pt()->communicator_pt()->nproc();
4371
4372 // determine the maximum number of rows to be sent or recv
4373 unsigned max_n_send_or_recv = 0;
4374 for (unsigned p = 0; p < nproc; p++)
4375 {
4377 std::max(max_n_send_or_recv, Nrows_to_send_for_get_block(b, p));
4379 std::max(max_n_send_or_recv, Nrows_to_recv_for_get_block(b, p));
4380 }
4381
4382 // create a vectors of 1s (the size of the nblock for the mpi indexed
4383 // data types
4384 int* block_lengths = new int[max_n_send_or_recv];
4385 for (unsigned i = 0; i < max_n_send_or_recv; i++)
4386 {
4387 block_lengths[i] = 1;
4388 }
4389
4390 // perform the sends and receives
4392 for (unsigned p = 0; p < nproc; p++)
4393 {
4394 // send and recv with other processors
4395 if (p != my_rank)
4396 {
// Note the send/recv roles are mirrored w.r.t. the "get" direction:
// rows this processor *received* during get are now *sent* back.
4397 if (Nrows_to_recv_for_get_block(b, p) > 0)
4398 {
4399 // create the send datatype
4401 MPI_Type_indexed(Nrows_to_recv_for_get_block(b, p),
4403 Rows_to_recv_for_get_block(b, p),
4404 MPI_DOUBLE,
4405 &type_send);
4407
4408 // send
4410 MPI_Isend(const_cast<double*>(w.values_pt()),
4411 1,
4412 type_send,
4413 p,
4414 0,
4415 this->distribution_pt()->communicator_pt()->mpi_comm(),
4416 &send_req);
4418 requests.push_back(send_req);
4419 }
4420
4421 if (Nrows_to_send_for_get_block(b, p) > 0)
4422 {
4423 // create the recv datatype
4425 MPI_Type_indexed(Nrows_to_send_for_get_block(b, p),
4427 Rows_to_send_for_get_block(b, p),
4428 MPI_DOUBLE,
4429 &type_recv);
4431
4432 // recv
4434 MPI_Irecv(v.values_pt(),
4435 1,
4436 type_recv,
4437 p,
4438 0,
4439 this->distribution_pt()->communicator_pt()->mpi_comm(),
4440 &recv_req);
4442 requests.push_back(recv_req);
4443 }
4444 }
4445
4446 // communicate with self
4447 else
4448 {
4449 const double* w_values_pt = w.values_pt();
4450 double* v_values_pt = v.values_pt();
4451 for (unsigned i = 0; i < Nrows_to_send_for_get_block(b, p); i++)
4452 {
4453 v_values_pt[Rows_to_send_for_get_block(b, p)[i]] =
4454 w_values_pt[Rows_to_recv_for_get_block(b, p)[i]];
4455 }
4456 }
4457 }
4458
4459 // and then just wait
4460 unsigned c = requests.size();
4462 if (c)
4463 {
4464 MPI_Waitall(c, &requests[0], &stat[0]);
4465 }
4466 delete[] block_lengths;
4467
4468#else
4469 // throw error
4470 std::ostringstream error_message;
4471 error_message << "The preconditioner is distributed and on more than one "
4472 << "processor. MPI is required.";
4473 throw OomphLibError(
4474 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4475#endif
4476 }
4477 }
4478
4479 //============================================================================
4480 /// Takes the n-th block ordered vector, b, and copies its entries
4481 /// to the appropriate entries in the naturally ordered vector, v.
4482 /// Here n is the block number in the current block preconditioner.
4483 /// If the preconditioner is a subsidiary block preconditioner
4484 /// the other entries in v that are not associated with it
4485 /// are left alone.
4486 //============================================================================
// Purpose: public return_block_vector -- scatter the n-th block vector
// (argument "b", a DoubleVector) back into the naturally ordered global
// vector v, honouring dof-type coarsening: for a coarsened block the
// vector is first split into the most-fine-grain dof-level vectors and
// then returned via internal_return_block_vectors(...).
//
// NOTE(review): in the first PARANOID check below, "err_msg << ... << b"
// streams the DoubleVector argument; the block index "n" was presumably
// intended -- verify against upstream.
// NOTE(review): lossy Doxygen extraction -- missing numbered lines include
// the signature line (4488, presumably "void BlockPreconditioner<MATRIX>::
// return_block_vector(const unsigned& n,"), the OomphLibError argument
// lines (4504/4511/4520/4527), the dof_vector declaration (presumably
// 4546: "Vector<DoubleVector> dof_vector(n_dof_vec);") and the split call
// (presumably 4553:
// "DoubleVectorHelpers::split_without_communication(b, dof_vector);").
4487 template<typename MATRIX>
4489 const DoubleVector& b,
4490 DoubleVector& v) const
4491 {
4492#ifdef PARANOID
4493 // the number of blocks
4494 const unsigned para_n_blocks = nblock_types();
4495
4496 // paranoid check that block i is in this block preconditioner
4497 if (n >= para_n_blocks)
4498 {
4499 std::ostringstream err_msg;
4500 err_msg << "Requested block vector " << b
4501 << ", however this preconditioner has " << para_n_blocks
4502 << " block types.\n";
4503 throw OomphLibError(
4505 }
4506 if (!v.built())
4507 {
4508 std::ostringstream err_msg;
4509 err_msg << "The distribution of the global vector v must be setup.";
4510 throw OomphLibError(
4512 }
4513 if (*v.distribution_pt() != *this->master_distribution_pt())
4514 {
4515 std::ostringstream err_msg;
4516 err_msg << "The distribution of the global vector v must match the "
4517 << " specified master_distribution_pt(). \n"
4518 << "i.e. Distribution_pt in the master preconditioner";
4519 throw OomphLibError(
4521 }
4522 if (!b.built())
4523 {
4524 std::ostringstream err_msg;
4525 err_msg << "The distribution of the block vector b must be setup.";
4526 throw OomphLibError(
4528 }
4529
4530#endif
4531
4532 // Get the most fine grain dof
4533 Vector<unsigned> most_fine_grain_dof = Block_to_dof_map_fine[n];
4534
4535 // How many dofs are in this block?
4536 const unsigned n_dof_vec = Block_to_dof_map_fine[n].size();
4537
4538 if (n_dof_vec == 1)
4539 // There is only one dof, no need to split.
4540 {
4541 internal_return_block_vector(most_fine_grain_dof[0], b, v);
4542 }
4543 else
4544 // Need to split the vector up before we insert them all in one go.
4545 {
4547 for (unsigned d = 0; d < n_dof_vec; d++)
4548 {
4549 dof_vector[d].build(
4550 internal_block_distribution_pt(most_fine_grain_dof[d]));
4551 }
4552
4554
4555 // return to v
4556 internal_return_block_vectors(most_fine_grain_dof, dof_vector, v);
4557 }
4558 } // return_block_vector(...)
4559
4560 //============================================================================
4561 /// Given the naturally ordered vector, v, return
4562 /// the vector rearranged in block order in w. This is a legacy function
4563 /// from the old block preconditioning framework. Kept alive in case it may
4564 /// be needed again.
4565 ///
4566 /// This uses the variables ending in "get_ordered". We no longer use this
4567 /// type of method. This function copy values from v and re-order them
4568 /// in "block order" and place them in w. Block order means that the
4569 /// values in w are the same as the concatenated block vectors.
4570 ///
4571 /// I.e. - v is naturally ordered.
4572 /// v -> s_b, v is ordered into blocks vectors
4573 /// (requires communication)
4574 /// concatenate_without_communication(s_{0,...,nblocks},w) gives w.
4575 ///
4576 /// But this function skips out the concatenation part and builds w directly
4577 /// from v.
4578 ///
4579 /// This is nice but the function is implemented in such a way that it
4580 /// always use all the (internal) blocks and concatenated with the
4581 /// identity ordering. I.e. if this preconditioner has 3 block types, then
4582 /// w will always be:
 4583 /// concatenate_without_communication([s_0, s_1, s_2], w). There is no
 4584 /// easy way to change this.
4585 ///
4586 /// Furthermore, it does not take into account the new dof type coarsening
4587 /// feature. So this function will most likely produce the incorrect vector
4588 /// w from what the user intended. It still works, but w will be the
4589 /// concatenation of the most fine grain dof block vectors with the
4590 /// "natural" dof type ordering.
4591 ///
4592 /// This has been superseded by the function
4593 /// get_block_ordered_preconditioner_vector(...) which does the correct
4594 /// thing.
4595 //============================================================================
// Purpose: legacy -- reorder the naturally ordered vector v into "block
// order" in w (i.e. w equals the concatenation of all internal block
// vectors in identity order), using the *_get_ordered lookup arrays.
// Superseded by get_block_ordered_preconditioner_vector(...), which also
// respects dof-type coarsening; kept alive as a private member.
//
// NOTE(review): lossy Doxygen extraction -- missing numbered lines include
// the signature line (4597/4598), the serial path's offset update
// (presumably 4643: "block_offset += block_nrow;" -- without it the serial
// copy below would overwrite block 0's range; confirm against upstream),
// the "max_n_send_or_recv =" left-hand sides (4661/4663), and the
// MPI_Datatype/MPI_Request/MPI_Status declarations plus
// MPI_Type_commit/MPI_Type_free calls in the distributed path.
4596 template<typename MATRIX>
4599 DoubleVector& w) const
4600 {
4601#ifdef PARANOID
4602 if (!v.built())
4603 {
4604 std::ostringstream error_message;
4605 error_message << "The distribution of the global vector v must be setup.";
4606 throw OomphLibError(
4607 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4608 }
4609 if (*v.distribution_pt() != *this->master_distribution_pt())
4610 {
4611 std::ostringstream error_message;
4612 error_message << "The distribution of the global vector v must match the "
4613 << " specified master_distribution_pt(). \n"
4614 << "i.e. Distribution_pt in the master preconditioner";
4615 throw OomphLibError(
4616 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4617 }
4618#endif
4619
4620 // Cleared and resized w for reordered vector
4621 w.build(this->internal_preconditioner_matrix_distribution_pt(), 0.0);
4622
4623 // if + only one processor
4624 // + more than one processor but matrix_pt is not distributed
4625 // then use the serial get_block method
4626 if (this->distribution_pt()->communicator_pt()->nproc() == 1 ||
4627 !this->distribution_pt()->distributed())
4628 {
4629 // number of blocks
4630 unsigned nblock = this->Internal_nblock_types;
4631
4632 // copy to w
4633 unsigned block_offset = 0;
4634 double* w_pt = w.values_pt();
4635 const double* v_pt = v.values_pt();
4636 for (unsigned b = 0; b < nblock; b++)
4637 {
4638 unsigned block_nrow = this->internal_block_dimension(b);
4639 for (unsigned i = 0; i < block_nrow; i++)
4640 {
4641 w_pt[block_offset + i] = v_pt[this->Global_index[b][i]];
4642 }
4644 }
4645 }
4646 // otherwise use mpi
4647 else
4648 {
4649#ifdef OOMPH_HAS_MPI
4650
4651 // my rank
4652 unsigned my_rank = this->distribution_pt()->communicator_pt()->my_rank();
4653
4654 // the number of processors
4655 unsigned nproc = this->distribution_pt()->communicator_pt()->nproc();
4656
4657 // determine the maximum number of rows to be sent or recv
4658 unsigned max_n_send_or_recv = 0;
4659 for (unsigned p = 0; p < nproc; p++)
4660 {
4662 std::max(max_n_send_or_recv, Nrows_to_send_for_get_ordered[p]);
4664 std::max(max_n_send_or_recv, Nrows_to_recv_for_get_ordered[p]);
4665 }
4666
4667 // create a vectors of 1s (the size of the nblock for the mpi indexed
4668 // data types
4669 int* block_lengths = new int[max_n_send_or_recv];
4670 for (unsigned i = 0; i < max_n_send_or_recv; i++)
4671 {
4672 block_lengths[i] = 1;
4673 }
4674
4675 // perform the sends and receives
4677 for (unsigned p = 0; p < nproc; p++)
4678 {
4679 // send and recv with other processors
4680 if (p != my_rank)
4681 {
4682 if (Nrows_to_send_for_get_ordered[p] > 0)
4683 {
4684 // create the send datatype
4686 MPI_Type_indexed(Nrows_to_send_for_get_ordered[p],
4688 Rows_to_send_for_get_ordered[p],
4689 MPI_DOUBLE,
4690 &type_send);
4692
4693 // send
4695 MPI_Isend(const_cast<double*>(v.values_pt()),
4696 1,
4697 type_send,
4698 p,
4699 0,
4700 this->distribution_pt()->communicator_pt()->mpi_comm(),
4701 &send_req);
4703 requests.push_back(send_req);
4704 }
4705
4706 if (Nrows_to_recv_for_get_ordered[p] > 0)
4707 {
4708 // create the recv datatype
4710 MPI_Type_indexed(Nrows_to_recv_for_get_ordered[p],
4712 Rows_to_recv_for_get_ordered[p],
4713 MPI_DOUBLE,
4714 &type_recv);
4716
4717 // recv
4719 MPI_Irecv(w.values_pt(),
4720 1,
4721 type_recv,
4722 p,
4723 0,
4724 this->distribution_pt()->communicator_pt()->mpi_comm(),
4725 &recv_req);
4727 requests.push_back(recv_req);
4728 }
4729 }
4730
4731 // communicate with self
4732 else
4733 {
4734 double* w_values_pt = w.values_pt();
4735 const double* v_values_pt = v.values_pt();
4736 for (unsigned i = 0; i < Nrows_to_send_for_get_ordered[p]; i++)
4737 {
4738 w_values_pt[Rows_to_recv_for_get_ordered[p][i]] =
4739 v_values_pt[Rows_to_send_for_get_ordered[p][i]];
4740 }
4741 }
4742 }
4743
4744 // and then just wait
4745 unsigned c = requests.size();
4747 if (c)
4748 {
4749 MPI_Waitall(c, &requests[0], &stat[0]);
4750 }
4751 delete[] block_lengths;
4752
4753#else
4754 // throw error
4755 std::ostringstream error_message;
4756 error_message << "The preconditioner is distributed and on more than one "
4757 << "processor. MPI is required.";
4758 throw OomphLibError(
4759 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4760#endif
4761 }
4762 }
4763
4764 //============================================================================
4765 /// Given the naturally ordered vector, v, return
4766 /// the vector rearranged in block order in w. This function calls
4767 /// get_concatenated_block_vector(...) with the identity block mapping.
4768 ///
4769 /// This function has been re-written to work with the new dof type
4770 /// coarsening feature. The old function is kept alive in
4771 /// internal_get_block_ordered_preconditioner_vector(...) and is moved to
4772 /// the private section of the code. The differences between the two are:
4773 ///
4774 /// 1) This function extracts all the block vectors (in one go) via the
4775 /// function internal_get_block_vectors(...), and concatenates them.
4776 ///
4777 /// 2) The old function makes use of the variables ending in "get_ordered",
4778 /// thus is slightly more efficient since it does not have to concatenate
4779 /// any block vectors.
4780 ///
4781 /// 3) The old function no longer respect the new indirections if dof types
4782 /// have been coarsened.
4783 ///
4784 /// 4) This function extracts the most fine grain dof-level vectors and
4785 /// concatenates them. These dof-level vectors respect the re-ordering
4786 /// caused by the coarsening of dof types. The overhead associated with
4787 /// concatenating DoubleVectors without communication is very small.
4788 ///
4789 /// This function should be used.
4790 //============================================================================
// Purpose: reorder the naturally ordered vector v into block order in w by
// delegating to get_concatenated_block_vector(...) with the identity block
// mapping; this path respects dof-type coarsening and is the function that
// should be used (see the header comment above).
//
// NOTE(review): lossy Doxygen extraction -- missing numbered lines: 4792
// (the signature line, presumably "void BlockPreconditioner<MATRIX>::
// get_block_ordered_preconditioner_vector(") and 4818 (presumably
// "Vector<unsigned> block_vec_number(nblocks);", used below).
4791 template<typename MATRIX>
4793 const DoubleVector& v, DoubleVector& w)
4794 {
4795#ifdef PARANOID
4796 if (!v.built())
4797 {
4798 std::ostringstream error_message;
4799 error_message << "The distribution of the global vector v must be setup.";
4800 throw OomphLibError(
4801 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4802 }
4803 if (*v.distribution_pt() != *this->master_distribution_pt())
4804 {
4805 std::ostringstream error_message;
4806 error_message << "The distribution of the global vector v must match the "
4807 << " specified master_distribution_pt(). \n"
4808 << "i.e. Distribution_pt in the master preconditioner";
4809 throw OomphLibError(
4810 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4811 }
4812#endif
4813
4814 // Get the number of blocks.
4815 unsigned nblocks = this->nblock_types();
4816
4817 // Fill in the identity mapping.
4819 for (unsigned b = 0; b < nblocks; b++)
4820 {
4821 block_vec_number[b] = b;
4822 }
4823
4824 // Do the work.
4825 get_concatenated_block_vector(block_vec_number, v, w);
4826 } // get_block_ordered_preconditioner_vector(...)
4827
4828 //============================================================================
4829 /// Takes the block ordered vector, w, and reorders it in the natural
4830 /// order. Reordered vector is returned in v. Note: If the preconditioner is
4831 /// a subsidiary preconditioner then only the components of the vector
4832 /// associated with the blocks of the subsidiary preconditioner will be
4833 /// included. Hence the length of v is master_nrow() whereas that of the
4834 /// vector w is of length this->nrow().
4835 ///
4836 /// This is the return function for the function
4837 /// internal_get_block_ordered_preconditioner_vector(...).
4838 /// Both internal_get_block_ordered_preconditioner_vector(...) and
4839 /// internal_return_block_ordered_preconditioner_vector(...) has been
4840 /// superseded by the functions
4841 ///
4842 /// get_block_ordered_preconditioner_vector(...) and
4843 /// return_block_ordered_preconditioner_vector(...),
4844 ///
4845 /// Thus this function is moved to the private section of the code.
4846 //============================================================================
// Purpose: legacy/private counterpart of
// internal_get_block_ordered_preconditioner_vector(...) -- scatter the
// block-ordered vector w back into the naturally ordered vector v using
// the *_get_ordered lookup arrays (send/recv roles mirrored w.r.t. the
// "get" direction). Entries of v not associated with this (possibly
// subsidiary) preconditioner are left untouched.
//
// NOTE(review): lossy Doxygen extraction -- missing numbered lines include
// the signature line (4848/4849), the serial path's offset update
// (presumably 4908: "block_offset += block_nrow;"), the
// "max_n_send_or_recv =" left-hand sides (4926/4928), the
// MPI_Datatype/MPI_Request declarations and MPI_Type_commit/Type_free
// calls, and the status array (presumably 5011:
// "Vector<MPI_Status> stat(c);"). Recover from upstream before compiling.
4847 template<typename MATRIX>
4850 DoubleVector& v) const
4851 {
4852#ifdef PARANOID
4853 if (!v.built())
4854 {
4855 std::ostringstream error_message;
4856 error_message << "The distribution of the global vector v must be setup.";
4857 throw OomphLibError(
4858 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4859 }
4860 if (*v.distribution_pt() != *this->master_distribution_pt())
4861 {
4862 std::ostringstream error_message;
4863 error_message << "The distribution of the global vector v must match the "
4864 << " specified master_distribution_pt(). \n"
4865 << "i.e. Distribution_pt in the master preconditioner";
4866 throw OomphLibError(
4867 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4868 }
4869 if (!w.built())
4870 {
4871 std::ostringstream error_message;
4872 error_message << "The distribution of the block vector w must be setup.";
4873 throw OomphLibError(
4874 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4875 }
4876 if (*w.distribution_pt() !=
4877 *this->internal_preconditioner_matrix_distribution_pt())
4878 {
4879 std::ostringstream error_message;
4880 error_message << "The distribution of the block vector w must match the "
4881 << " specified distribution at Distribution_pt[b]";
4882 throw OomphLibError(
4883 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4884 }
4885#endif
4886
4887
4888 // if + only one processor
4889 // + more than one processor but matrix_pt is not distributed
4890 // then use the serial get_block method
4891 if (this->distribution_pt()->communicator_pt()->nproc() == 1 ||
4892 !this->distribution_pt()->distributed())
4893 {
4894 // number of blocks
4895 unsigned nblock = this->Internal_nblock_types;
4896
4897 // copy to w
4898 unsigned block_offset = 0;
4899 const double* w_pt = w.values_pt();
4900 double* v_pt = v.values_pt();
4901 for (unsigned b = 0; b < nblock; b++)
4902 {
4903 unsigned block_nrow = this->internal_block_dimension(b);
4904 for (unsigned i = 0; i < block_nrow; i++)
4905 {
4906 v_pt[this->Global_index[b][i]] = w_pt[block_offset + i];
4907 }
4909 }
4910 }
4911 // otherwise use mpi
4912 else
4913 {
4914#ifdef OOMPH_HAS_MPI
4915
4916 // my rank
4917 unsigned my_rank = this->distribution_pt()->communicator_pt()->my_rank();
4918
4919 // the number of processors
4920 unsigned nproc = this->distribution_pt()->communicator_pt()->nproc();
4921
4922 // determine the maximum number of rows to be sent or recv
4923 unsigned max_n_send_or_recv = 0;
4924 for (unsigned p = 0; p < nproc; p++)
4925 {
4927 std::max(max_n_send_or_recv, Nrows_to_send_for_get_ordered[p]);
4929 std::max(max_n_send_or_recv, Nrows_to_recv_for_get_ordered[p]);
4930 }
4931
4932 // create a vectors of 1s (the size of the nblock for the mpi indexed
4933 // data types
4934 int* block_lengths = new int[max_n_send_or_recv];
4935 for (unsigned i = 0; i < max_n_send_or_recv; i++)
4936 {
4937 block_lengths[i] = 1;
4938 }
4939
4940 // perform the sends and receives
4942 for (unsigned p = 0; p < nproc; p++)
4943 {
4944 // send and recv with other processors
4945 if (p != my_rank)
4946 {
4947 if (Nrows_to_recv_for_get_ordered[p] > 0)
4948 {
4949 // create the send datatype
4951 MPI_Type_indexed(Nrows_to_recv_for_get_ordered[p],
4953 Rows_to_recv_for_get_ordered[p],
4954 MPI_DOUBLE,
4955 &type_send);
4957
4958 // send
4960 MPI_Isend(const_cast<double*>(w.values_pt()),
4961 1,
4962 type_send,
4963 p,
4964 0,
4965 this->distribution_pt()->communicator_pt()->mpi_comm(),
4966 &send_req);
4968 requests.push_back(send_req);
4969 }
4970
4971 if (Nrows_to_send_for_get_ordered[p] > 0)
4972 {
4973 // create the recv datatype
4975 MPI_Type_indexed(Nrows_to_send_for_get_ordered[p],
4977 Rows_to_send_for_get_ordered[p],
4978 MPI_DOUBLE,
4979 &type_recv);
4981
4982 // recv
4984 MPI_Irecv(v.values_pt(),
4985 1,
4986 type_recv,
4987 p,
4988 0,
4989 this->distribution_pt()->communicator_pt()->mpi_comm(),
4990 &recv_req);
4992 requests.push_back(recv_req);
4993 }
4994 }
4995
4996 // communicate with self
4997 else
4998 {
4999 const double* w_values_pt = w.values_pt();
5000 double* v_values_pt = v.values_pt();
5001 for (unsigned i = 0; i < Nrows_to_send_for_get_ordered[p]; i++)
5002 {
5003 v_values_pt[Rows_to_send_for_get_ordered[p][i]] =
5004 w_values_pt[Rows_to_recv_for_get_ordered[p][i]];
5005 }
5006 }
5007 }
5008
5009 // and then just wait
5010 unsigned c = requests.size();
5012 if (c)
5013 {
5014 MPI_Waitall(c, &requests[0], &stat[0]);
5015 }
5016 delete[] block_lengths;
5017
5018#else
5019 // throw error
5020 std::ostringstream error_message;
5021 error_message << "The preconditioner is distributed and on more than one "
5022 << "processor. MPI is required.";
5023 throw OomphLibError(
5024 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
5025#endif
5026 } // else use mpi
5027 } // function return_block_ordered_preconditioner_vector
5028
5029
5030 //============================================================================
5031 /// Takes the block ordered vector, w, and reorders it in natural
5032 /// order. Reordered vector is returned in v. Note: If the preconditioner is
5033 /// a subsidiary preconditioner then only the components of the vector
5034 /// associated with the blocks of the subsidiary preconditioner will be
5035 /// included. Hence the length of v is master_nrow() whereas that of the
5036 /// vector w is of length this->nrow().
5037 ///
5038 /// This is the return function for the function
5039 /// get_block_ordered_preconditioner_vector(...).
5040 ///
5041 /// It calls the function return_concatenated_block_vector(...) with the
5042 /// identity block number ordering.
5043 //============================================================================
// Purpose: scatter the block-ordered vector w back into the naturally
// ordered vector v by delegating to return_concatenated_block_vector(...)
// with the identity block ordering; the return counterpart of
// get_block_ordered_preconditioner_vector(...).
//
// NOTE(review): lossy Doxygen extraction -- missing numbered lines: 5045
// (the signature line, presumably "void BlockPreconditioner<MATRIX>::
// return_block_ordered_preconditioner_vector(") and 5085 (presumably
// "Vector<unsigned> block_vec_number(nblocks);", used below).
5044 template<typename MATRIX>
5046 const DoubleVector& w, DoubleVector& v) const
5047 {
5048#ifdef PARANOID
5049 if (!v.built())
5050 {
5051 std::ostringstream error_message;
5052 error_message << "The distribution of the global vector v must be setup.";
5053 throw OomphLibError(
5054 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
5055 }
5056 if (*v.distribution_pt() != *this->master_distribution_pt())
5057 {
5058 std::ostringstream error_message;
5059 error_message << "The distribution of the global vector v must match the "
5060 << " specified master_distribution_pt(). \n"
5061 << "i.e. Distribution_pt in the master preconditioner";
5062 throw OomphLibError(
5063 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
5064 }
5065 if (!w.built())
5066 {
5067 std::ostringstream error_message;
5068 error_message << "The distribution of the block vector w must be setup.";
5069 throw OomphLibError(
5070 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
5071 }
5072 if (*w.distribution_pt() != *this->preconditioner_matrix_distribution_pt())
5073 {
5074 std::ostringstream error_message;
5075 error_message << "The distribution of the block vector w must match the "
5076 << "concatenations of distributions in "
5077 << "Block_distribution_pt.\n";
5078 throw OomphLibError(
5079 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
5080 }
5081#endif
5082
5083 // Split into the block vectors.
5084 const unsigned nblocks = nblock_types();
5086 for (unsigned b = 0; b < nblocks; b++)
5087 {
5088 block_vec_number[b] = b;
5089 }
5090
5091 return_concatenated_block_vector(block_vec_number, w, v);
5092 } // function return_block_ordered_preconditioner_vector
5093
5094 //=============================================================================
5095 /// Gets block (i,j) from the matrix pointed to by
5096 /// Matrix_pt and returns it in output_block. This is associated with the
5097 /// internal blocks. Please use the other get_block(...) function.
5098 //=============================================================================
5099 template<>
5101 const unsigned& block_i,
5102 const unsigned& block_j,
5104 {
5105#ifdef PARANOID
5106 // the number of blocks
5107 const unsigned n_blocks = this->internal_nblock_types();
5108
5109 // paranoid check that block i is in this block preconditioner
5110 if (block_i >= n_blocks || block_j >= n_blocks)
5111 {
5112 std::ostringstream error_message;
5113 error_message
5114 << "Requested block (" << block_i << "," << block_j
5115 << "), however this preconditioner has internal_nblock_types() "
5116 << "= " << internal_nblock_types() << std::endl;
5117 throw OomphLibError(
5118 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
5119 }
5120
5121 // Check that the matrix is the same as that of the master
5122 if (is_subsidiary_block_preconditioner())
5123 {
5124 if (master_block_preconditioner_pt()->matrix_pt() != matrix_pt())
5125 {
5126 std::string err = "Master and subs should have same matrix.";
5127 throw OomphLibError(
5129 }
5130 }
5131#endif
5132
5133 // Cast the pointer
5134 CRDoubleMatrix* cr_matrix_pt = dynamic_cast<CRDoubleMatrix*>(matrix_pt());
5135
5136 // if + only one processor
5137 // + more than one processor but matrix_pt is not distributed
5138 // then use the serial get_block method
5139 if (cr_matrix_pt->distribution_pt()->communicator_pt()->nproc() == 1 ||
5140 !cr_matrix_pt->distribution_pt()->distributed())
5141 {
5142 // pointers for the jacobian matrix is compressed row sparse format
5143 int* j_row_start;
5144 int* j_column_index;
5145 double* j_value;
5146
5147 // sets pointers to jacobian matrix
5148 j_row_start = cr_matrix_pt->row_start();
5149 j_column_index = cr_matrix_pt->column_index();
5150 j_value = cr_matrix_pt->value();
5151
5152 // get the block dimensions
5153 unsigned block_nrow = this->internal_block_dimension(block_i);
5154 unsigned block_ncol = this->internal_block_dimension(block_j);
5155
5156 // allocate temporary storage for the component vectors of block (i,j)
5157 // temp_ptr is used to point to an element in each column - required as
5158 // cannot assume that order of block's rows in jacobian and the block
5159 // matrix will be the same
5160 int* temp_row_start = new int[block_nrow + 1];
5161 for (unsigned i = 0; i <= block_nrow; i++)
5162 {
5163 temp_row_start[i] = 0;
5164 }
5166 int block_nnz = 0;
5167
5168 // get number of rows in source matrix
5169 unsigned master_nrow = this->master_nrow();
5170
5171 // determine how many non zeros there are in the block (i,j)
5172 // also determines how many non zeros are stored in each row or column -
5173 // stored in temp_ptr temporarily
5174 for (unsigned k = 0; k < master_nrow; k++)
5175 {
5176 if (internal_block_number(k) == static_cast<int>(block_i))
5177 {
5178 for (int l = j_row_start[k]; l < j_row_start[k + 1]; l++)
5179 {
5180 if (internal_block_number(j_column_index[l]) ==
5181 static_cast<int>(block_j))
5182 {
5183 block_nnz++;
5184 temp_ptr[internal_index_in_block(k) + 1]++;
5185 }
5186 }
5187 }
5188 }
5189
5190 // if the matrix is not empty
5191 int* temp_column_index = new int[block_nnz];
5192 double* temp_value = new double[block_nnz];
5193 if (block_nnz > 0)
5194 {
5195 // uses number of elements in each column of block to determine values
5196 // for the block column start (temp_row_start)
5197 temp_row_start[0] = 0;
5198 for (unsigned k = 1; k <= block_nrow; k++)
5199 {
5202 }
5203
5204 // copies the relevant elements of the jacobian to the correct entries
5205 // of the block matrix
5206 for (unsigned k = 0; k < master_nrow; k++)
5207 {
5208 if (internal_block_number(k) == static_cast<int>(block_i))
5209 {
5210 for (int l = j_row_start[k]; l < j_row_start[k + 1]; l++)
5211 {
5212 if (internal_block_number(j_column_index[l]) ==
5213 static_cast<int>(block_j))
5214 {
5215 int kk = temp_ptr[internal_index_in_block(k)]++;
5216 temp_value[kk] = j_value[l];
5218 internal_index_in_block(j_column_index[l]);
5219 }
5220 }
5221 }
5222 }
5223 }
5224
5225
5226 // Fill in the compressed row matrix ??ds Note: I kept the calls to
5227 // build as close as I could to before (had to replace new(dist) with
5228 // .build(dist) ).
5229 output_block.build(Internal_block_distribution_pt[block_i]);
5230 output_block.build_without_copy(
5232
5233#ifdef PARANOID
5234 // checks to see if block matrix has been set up correctly
5235 // block_matrix_test(matrix_pt,block_i,block_j,block_pt);
5236 if (Run_block_matrix_test)
5237 {
5238 // checks to see if block matrix has been set up correctly
5239 block_matrix_test(block_i, block_j, &output_block);
5240 }
5241#endif
5242 }
5243
5244
5245 // otherwise we are dealing with a distributed matrix
5246 else
5247 {
5248#ifdef OOMPH_HAS_MPI
5249 // number of processors
5250 unsigned nproc = this->distribution_pt()->communicator_pt()->nproc();
5251
5252 // my rank
5253 unsigned my_rank = this->distribution_pt()->communicator_pt()->my_rank();
5254
5255 // sets pointers to jacobian matrix
5256 int* j_row_start = cr_matrix_pt->row_start();
5257 int* j_column_index = cr_matrix_pt->column_index();
5258 double* j_value = cr_matrix_pt->value();
5259
5260 // number of non zeros in each row to be sent
5262
5263 // number of non zeros in each row to be received
5265
5266 // storage for data to be sent
5269
5270 // number of non zeros to be sent to each processor
5272
5273 // number of rows of the block matrix on this processor
5274 unsigned nrow_local =
5275 Internal_block_distribution_pt[block_i]->nrow_local();
5276
5277 // resize the nnz storage and compute nnz_send
5278 // and send and recv the nnz
5281 for (unsigned p = 0; p < nproc; p++)
5282 {
5283 int nrow_send = Nrows_to_send_for_get_block(block_i, p);
5284 int nrow_recv = Nrows_to_recv_for_get_block(block_i, p);
5285
5286 // assemble nnz recv
5287 nnz_recv[p] = new int[nrow_recv];
5288
5289 // assemble the storage to send
5290 if (nrow_send > 0 && p != my_rank)
5291 {
5292 nnz_send[p] = new int[nrow_send];
5293 }
5294
5295 // compute the number of nnzs in each row and the total number
5296 // of nnzs
5297 for (int i = 0; i < nrow_send; i++)
5298 {
5299 unsigned row = Rows_to_send_for_get_block(block_i, p)[i];
5300 int c = 0;
5301 for (int r = j_row_start[row]; r < j_row_start[row + 1]; r++)
5302 {
5303 if (internal_block_number(j_column_index[r]) == int(block_j))
5304 {
5305 c++;
5306 }
5307 }
5308 if (p != my_rank)
5309 {
5310 nnz_send[p][i] = c;
5311 }
5312 else
5313 {
5314 nnz_recv[p][i] = c;
5315 }
5316 total_nnz_send[p] += c;
5317 }
5318
5319 // send
5320 if (p != my_rank)
5321 {
5322 if (nrow_send)
5323 {
5326 nrow_send,
5327 MPI_INT,
5328 p,
5329 0,
5330 this->distribution_pt()->communicator_pt()->mpi_comm(),
5331 &req);
5332 send_req.push_back(req);
5333 }
5334
5335 // recv
5336 if (nrow_recv)
5337 {
5340 nrow_recv,
5341 MPI_INT,
5342 p,
5343 0,
5344 this->distribution_pt()->communicator_pt()->mpi_comm(),
5345 &req);
5346 recv1_req.push_back(req);
5347 }
5348 }
5349 }
5350
5351 // next assemble the values and row_start data to be sent for each
5352 // processor
5353 for (unsigned p = 0; p < nproc; p++)
5354 {
5355 int nrow_send = Nrows_to_send_for_get_block(block_i, p);
5356
5357 // assemble the storage for the values and column indices to be sent
5358 if (p != my_rank)
5359 {
5360 if (total_nnz_send[p] > 0)
5361 {
5362 values_for_proc[p] = new double[total_nnz_send[p]];
5364
5365 // copy the values and column indices to the storage
5366 unsigned ptr = 0;
5367 for (int i = 0; i < nrow_send; i++)
5368 {
5369 unsigned row = Rows_to_send_for_get_block(block_i, p)[i];
5370 for (int r = j_row_start[row]; r < j_row_start[row + 1]; r++)
5371 {
5372 if (internal_block_number(j_column_index[r]) == int(block_j))
5373 {
5376 internal_index_in_block(j_column_index[r]);
5377 ptr++;
5378 }
5379 }
5380 }
5381
5382 // create the datatypes
5388
5389 // get the start address of the vectors
5393
5394 // compute the displacements
5395 displacement[1] -= displacement[0];
5396 displacement[0] -= displacement[0];
5397
5398 // compute the block lengths
5399 int length[2];
5400 length[0] = length[1] = 1;
5401
5402 // build the struct data type
5406 MPI_Type_free(&types[0]);
5407 MPI_Type_free(&types[1]);
5408
5409 // and send
5412 1,
5413 final_type,
5414 p,
5415 1,
5416 this->distribution_pt()->communicator_pt()->mpi_comm(),
5417 &req);
5418 send_req.push_back(req);
5420 }
5421 }
5422 }
5423
5424 // wait for the recv to complete (the row_start recv which actually
5425 // contains the number of nnzs in each row)
5426 int c_recv = recv1_req.size();
5427 if (c_recv != 0)
5428 {
5430 }
5431
5432 // compute the total number of nnzs to be received
5434 int local_block_nnz = 0;
5435 for (unsigned p = 0; p < nproc; p++)
5436 {
5437 // compute the total nnzs
5438 for (unsigned i = 0; i < Nrows_to_recv_for_get_block(block_i, p); i++)
5439 {
5441 }
5443 }
5444
5445 // compute the offset for each block of nnzs (a matrix row) in the
5446 // values_recv and column_index_recv vectors
5447
5448 // fisrt determine how many blocks of rows are to be recv
5450 for (unsigned p = 0; p < nproc; p++)
5451 {
5452 if (Nrows_to_recv_for_get_block(block_i, p) > 0)
5453 {
5454 n_recv_block[p] = 1;
5455 }
5456 for (unsigned i = 1; i < Nrows_to_recv_for_get_block(block_i, p); i++)
5457 {
5458 if (Rows_to_recv_for_get_block(block_i, p)[i] !=
5459 Rows_to_recv_for_get_block(block_i, p)[i - 1] + 1)
5460 {
5461 n_recv_block[p]++;
5462 }
5463 }
5464 }
5465
5466 // next assemble row start recv
5467 int* row_start_recv = new int[nrow_local + 1];
5468 for (unsigned i = 0; i <= nrow_local; i++)
5469 {
5470 row_start_recv[i] = 0;
5471 }
5472 for (unsigned p = 0; p < nproc; p++)
5473 {
5474 for (unsigned i = 0; i < Nrows_to_recv_for_get_block(block_i, p); i++)
5475 {
5476 row_start_recv[Rows_to_recv_for_get_block(block_i, p)[i]] =
5477 nnz_recv[p][i];
5478 }
5479 }
5480 int g = row_start_recv[0];
5481 row_start_recv[0] = 0;
5482 for (unsigned i = 1; i < nrow_local; i++)
5483 {
5484 int temp_g = g;
5485 g = row_start_recv[i];
5487 }
5488 row_start_recv[nrow_local] = row_start_recv[nrow_local - 1] + g;
5489
5490 // next assemble the offset and the number of nzs in each recv block
5493 for (unsigned p = 0; p < nproc; p++)
5494 {
5495 if (Nrows_to_recv_for_get_block(block_i, p) > 0)
5496 {
5497 offset_recv_block[p] = new int[n_recv_block[p]];
5498 offset_recv_block[p][0] = 0;
5499 nnz_recv_block[p] = new int[n_recv_block[p]];
5500 for (int i = 0; i < n_recv_block[p]; i++)
5501 {
5502 nnz_recv_block[p][i] = 0;
5503 }
5504 unsigned ptr = 0;
5505 nnz_recv_block[p][ptr] += nnz_recv[p][0];
5506 offset_recv_block[p][0] =
5507 row_start_recv[Rows_to_recv_for_get_block(block_i, p)[0]];
5508 for (unsigned i = 1; i < Nrows_to_recv_for_get_block(block_i, p); i++)
5509 {
5510 if (Rows_to_recv_for_get_block(block_i, p)[i] !=
5511 Rows_to_recv_for_get_block(block_i, p)[i - 1] + 1)
5512 {
5513 ptr++;
5515 row_start_recv[Rows_to_recv_for_get_block(block_i, p)[i]];
5516 }
5517 nnz_recv_block[p][ptr] += nnz_recv[p][i];
5518 }
5519 }
5520 delete[] nnz_recv[p];
5521 }
5522
5523 // post the receives
5524 int* column_index_recv = new int[local_block_nnz];
5525 double* values_recv = new double[local_block_nnz];
5527 for (unsigned p = 0; p < nproc; p++)
5528 {
5529 if (p != my_rank)
5530 {
5531 if (total_nnz_recv_from_proc[p] != 0)
5532 {
5533 // create the datatypes
5538 MPI_DOUBLE,
5539 &types[0]);
5544 MPI_INT,
5545 &types[1]);
5547
5548 // compute the displacements
5554
5555 // compute the block lengths
5556 int length[2];
5557 length[0] = length[1] = 1;
5558
5559 // create the final datatype
5564 MPI_Type_free(&types[0]);
5565 MPI_Type_free(&types[1]);
5566
5567 // and the recv
5570 1,
5571 final_type,
5572 p,
5573 1,
5574 this->distribution_pt()->communicator_pt()->mpi_comm(),
5575 &req);
5576 recv2_req.push_back(req);
5578 }
5579 }
5580 else
5581 {
5582 // next send the values and column indices to self
5583 unsigned block_ptr = 0;
5584 unsigned counter = 0;
5585 int nrow_send = Nrows_to_send_for_get_block(block_i, my_rank);
5586 if (nrow_send > 0)
5587 {
5588 unsigned offset = offset_recv_block[my_rank][0];
5589 for (int i = 0; i < nrow_send; i++)
5590 {
5591 if (i > 0)
5592 {
5593 if (Rows_to_recv_for_get_block(block_i, p)[i] !=
5594 Rows_to_recv_for_get_block(block_i, p)[i - 1] + 1)
5595 {
5596 counter = 0;
5597 block_ptr++;
5599 }
5600 }
5601 unsigned row = Rows_to_send_for_get_block(block_i, my_rank)[i];
5602 for (int r = j_row_start[row]; r < j_row_start[row + 1]; r++)
5603 {
5604 if (internal_block_number(j_column_index[r]) == int(block_j))
5605 {
5606 values_recv[offset + counter] = j_value[r];
5607 column_index_recv[offset + counter] =
5608 internal_index_in_block(j_column_index[r]);
5609 counter++;
5610 }
5611 }
5612 }
5613 }
5614 }
5615 }
5616
5617 // wait for the recv to complete (for the column_index and the values_
5618 c_recv = recv2_req.size();
5619 if (c_recv != 0)
5620 {
5622 }
5623
5624 // Fill in the compressed row matrix
5625 output_block.build(Internal_block_distribution_pt[block_i]);
5626 output_block.build_without_copy(this->internal_block_dimension(block_j),
5631
5632 // wait for the send to complete (nnz / row_start)
5633 int c_send = send_req.size();
5634 if (c_send)
5635 {
5637 }
5638
5639 // delete temp storage used for assembling data for communication
5640 for (unsigned p = 0; p < nproc; p++)
5641 {
5642 delete[] nnz_send[p];
5643 delete[] column_index_for_proc[p];
5644 delete[] values_for_proc[p];
5645 delete[] offset_recv_block[p];
5646 delete[] nnz_recv_block[p];
5647 }
5648#else
5649 // throw error
5650 std::ostringstream error_message;
5651 error_message << "The matrix is distributed and on more than one "
5652 << "processor. MPI is required.";
5653 throw OomphLibError(
5654 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
5655#endif
5656 }
5657 }
5658
5659 //=============================================================================
5660 /// Gets dof-level block (i,j).
5661 /// If Replacement_dof_block_pt(i,j) is not null, then the replacement
5662 /// block is returned via a deep copy.
5663 ///
5664 /// Otherwise if this is the uppermost block preconditioner then it calls
5665 /// internal_get_block(i,j), else if it is a subsidiary
5666 /// block preconditioner, it will call it's master block preconditioners'
5667 /// get_dof_level_block function.
5668 //=============================================================================
5669 template<>
 // Gets dof-level block (block_i, block_j) for the CRDoubleMatrix
 // specialisation. If a replacement block has been registered for this
 // (i,j) pair (and ignore_replacement_block is false) it is returned;
 // otherwise the block is extracted directly (master preconditioner) or
 // obtained recursively from the master preconditioner (subsidiary case),
 // concatenating parent sub-blocks where dof types have been coarsened.
 // NOTE(review): this is a doxygen listing — several source lines (e.g.
 // 5670, 5688, 5692, 5695, 5723-5727) are elided by the hyperlink
 // extraction and do not appear below; consult the original source.
5671 const unsigned& block_i,
5672 const unsigned& block_j,
5674 const bool& ignore_replacement_block) const
5675 {
5676#ifdef PARANOID
5677 // the number of dof types.
5678 unsigned para_ndofs = ndof_types();
5679
5680 // paranoid check that block i is in this block preconditioner
5681 if (block_i >= para_ndofs || block_j >= para_ndofs)
5682 {
5683 std::ostringstream err_msg;
5684 err_msg << "Requested dof block (" << block_i << "," << block_j
5685 << "), however this preconditioner has ndof_types() "
5686 << "= " << para_ndofs << std::endl;
5687 throw OomphLibError(
5689 }
5690#endif
5691
 // Look up any replacement block registered for (block_i, block_j).
5693 Replacement_dof_block_pt.get(block_i, block_j);
5694
5696 {
5697 // Getting the block from parent preconditioner
5698 const unsigned ndof_in_parent_i =
5699 Doftype_coarsen_map_coarse[block_i].size();
5700 const unsigned ndof_in_parent_j =
5701 Doftype_coarsen_map_coarse[block_j].size();
5702
 // Simple case: each requested (coarsened) dof type maps to exactly one
 // parent dof type, so the parent block can be returned directly — no
 // concatenation of sub-blocks is needed.
5703 if (ndof_in_parent_i == 1 && ndof_in_parent_j == 1)
5704 {
5705 unsigned parent_dof_i = Doftype_coarsen_map_coarse[block_i][0];
5706 unsigned parent_dof_j = Doftype_coarsen_map_coarse[block_j][0];
5707
5708 if (is_master_block_preconditioner())
5709 {
5710 internal_get_block(parent_dof_i, parent_dof_j, output_block);
5711 }
5712 else
5713 {
 // Subsidiary preconditioner: translate our local dof-type numbers
 // into the master preconditioner's numbering before recursing.
5714 parent_dof_i = Doftype_in_master_preconditioner_coarse[parent_dof_i];
5715 parent_dof_j = Doftype_in_master_preconditioner_coarse[parent_dof_j];
5716
5717 master_block_preconditioner_pt()->get_dof_level_block(
5719 }
5720 }
5721 else
5722 {
 // General case: the requested block is a concatenation of several
 // parent sub-blocks. Gather every (dof_i, dof_j) sub-block first.
5725
5728
5729 for (unsigned dof_i = 0; dof_i < ndof_in_parent_i; dof_i++)
5730 {
5731 unsigned parent_dof_i = Doftype_coarsen_map_coarse[block_i][dof_i];
5732 if (is_subsidiary_block_preconditioner())
5733 {
5734 parent_dof_i =
5735 Doftype_in_master_preconditioner_coarse[parent_dof_i];
5736 }
5737
5738 for (unsigned dof_j = 0; dof_j < ndof_in_parent_j; dof_j++)
5739 {
5740 unsigned parent_dof_j = Doftype_coarsen_map_coarse[block_j][dof_j];
5741
5743
 // Record that this sub-block was newly created here so it can be
 // deleted again after concatenation (see clean-up loop below).
5744 new_block[dof_i][dof_j] = 1;
5745
5746 if (is_master_block_preconditioner())
5747 {
5748 internal_get_block(
5750 }
5751 else
5752 {
5753 parent_dof_j =
5754 Doftype_in_master_preconditioner_coarse[parent_dof_j];
5755
5756 master_block_preconditioner_pt()->get_dof_level_block(
5761 }
5762 }
5763 }
5764
 // Collect the row distributions of the constituent sub-blocks
 // (needed to build the concatenated result's row distribution).
5766
5767 for (unsigned parent_dof_i = 0; parent_dof_i < ndof_in_parent_i;
5768 parent_dof_i++)
5769 {
5770 unsigned mapped_dof_i =
5771 Doftype_coarsen_map_coarse[block_i][parent_dof_i];
5772
5773 if (is_master_block_preconditioner())
5774 {
5776 Internal_block_distribution_pt[mapped_dof_i];
5777 }
5778 else
5779 {
5780 mapped_dof_i =
5781 Doftype_in_master_preconditioner_coarse[mapped_dof_i];
5782
5784 master_block_preconditioner_pt()->dof_block_distribution_pt(
5785 mapped_dof_i);
5786 }
5787 }
5788
 // ... and likewise the column distributions.
5790
5791 for (unsigned parent_dof_j = 0; parent_dof_j < ndof_in_parent_j;
5792 parent_dof_j++)
5793 {
5794 unsigned mapped_dof_j =
5795 Doftype_coarsen_map_coarse[block_j][parent_dof_j];
5796
5797 if (is_master_block_preconditioner())
5798 {
5800 Internal_block_distribution_pt[mapped_dof_j];
5801 }
5802 else
5803 {
5804 mapped_dof_j =
5805 Doftype_in_master_preconditioner_coarse[mapped_dof_j];
5807 master_block_preconditioner_pt()->dof_block_distribution_pt(
5808 mapped_dof_j);
5809 }
5810 }
5811
 // Concatenate the sub-blocks into output_block (call elided in this
 // listing), then delete only the sub-blocks allocated above.
5814
5815 for (unsigned dof_i = 0; dof_i < ndof_in_parent_i; dof_i++)
5816 {
5817 for (unsigned dof_j = 0; dof_j < ndof_in_parent_j; dof_j++)
5818 {
5819 if (new_block[dof_i][dof_j])
5820 {
5821 delete tmp_blocks_pt(dof_i, dof_j);
5822 }
5823 }
5824 }
5825 }
5826 }
5827 else
5828 {
 // A replacement block exists and is not ignored — presumably returned
 // via a deep copy here (call elided in this listing); see the member
 // documentation of get_dof_level_block. TODO(review): confirm.
5830 }
5831 }
5832
5833 //=============================================================================
5834 /// test function to check that every element in the block matrix
5835 /// (block_i,block_j) matches the corresponding element in the original matrix
5836 //=============================================================================
5837 template<typename MATRIX>
 // Debug helper: verifies that every coefficient of the original matrix
 // that belongs to block (block_i, block_j) matches the corresponding
 // entry of the extracted block matrix; throws OomphLibError otherwise.
 // Cost is O(n_row * n_col) dense element access — only intended to be
 // enabled via Run_block_matrix_test for testing, never in production.
 // NOTE(review): doxygen listing — the signature line (source line 5838)
 // is elided by the hyperlink extraction.
5839 const unsigned& block_i,
5840 const unsigned& block_j,
5841 const MATRIX* block_matrix_pt) const
5842 {
5843 // boolean flag to indicate whether test is passed
5844 bool check = true;
5845
5846 // number of rows in matrix
5847 unsigned n_row = matrix_pt()->nrow();
5848
5849 // number of columns in matrix
5850 unsigned n_col = matrix_pt()->ncol();
5851
5852 // loop over rows of original matrix
5853 for (unsigned i = 0; i < n_row; i++)
5854 {
5855 // if this coefficient is associated with a block in this block
5856 // preconditioner
5857 if (static_cast<int>(block_i) == this->internal_block_number(i))
5858 {
5859 // loop over columns of original matrix
5860 for (unsigned j = 0; j < n_col; j++)
5861 {
5862 // if the coefficient is associated with a block in this block
5863 // preconditioner
5864 if (static_cast<int>(block_j) == this->internal_block_number(j))
5865 {
5866 // check whether elements in original matrix and matrix of block
5867 // pointers match. NOTE(review): exact (!=) floating-point
5868 // comparison — valid only because block entries are copied
5869 // verbatim from the original matrix, never recomputed.
5868 if (matrix_pt()->operator()(i, j) !=
5869 block_matrix_pt->operator()(internal_index_in_block(i),
5870 internal_index_in_block(j)))
5871 {
5872 check = false;
5873 }
5874 }
5875 }
5876 }
5877 }
5878
 // throw error (NOTE(review): message typo "require" -> "required";
 // left untouched here since the string is runtime behaviour)
5879 // throw error
5880 if (!check)
5881 {
5882 std::ostringstream error_message;
5883 error_message << "The require elements have not been successfully copied"
5884 << " from the original matrix to the block matrices";
5885 throw OomphLibError(
5886 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
5887 }
5888 }
5889
5890
5892
5893} // namespace oomph
e
Definition cfortran.h:571
static char t char * s
Definition cfortran.h:568
cstr elem_len * i
Definition cfortran.h:603
void internal_return_block_vector(const unsigned &n, const DoubleVector &b, DoubleVector &v) const
Takes the n-th block ordered vector, b, and copies its entries to the appropriate entries in the natu...
void return_block_vector(const unsigned &n, const DoubleVector &b, DoubleVector &v) const
Takes the n-th block ordered vector, b, and copies its entries to the appropriate entries in the natu...
void internal_return_block_vectors(const Vector< unsigned > &block_vec_number, const Vector< DoubleVector > &s, DoubleVector &v) const
A helper function, takes the vector of block vectors, s, and copies its entries into the naturally or...
void get_dof_level_block(const unsigned &i, const unsigned &j, MATRIX &output_block, const bool &ignore_replacement_block=false) const
Gets dof-level block (i,j). If Replacement_dof_block_pt(i,j) is not null, then the replacement block ...
void block_matrix_test(const unsigned &i, const unsigned &j, const MATRIX *block_matrix_pt) const
Private helper function to check that every element in the block matrix (i,j) matches the correspondi...
void internal_return_block_ordered_preconditioner_vector(const DoubleVector &w, DoubleVector &v) const
Takes the block ordered vector, w, and reorders it in the natural order. Reordered vector is returned...
void get_blocks(DenseMatrix< bool > &required_blocks, DenseMatrix< MATRIX * > &block_matrix_pt) const
Get all the block matrices required by the block preconditioner. Takes a pointer to a matrix of bools...
void return_concatenated_block_vector(const Vector< unsigned > &block_vec_number, const DoubleVector &b, DoubleVector &v) const
Takes concatenated block ordered vector, b, and copies its entries to the appropriate entries in the ...
static bool Run_block_matrix_test
Static boolean to allow block_matrix_test(...) to be run. Defaults to false.
void return_block_vectors(const Vector< unsigned > &block_vec_number, const Vector< DoubleVector > &s, DoubleVector &v) const
Takes the vector of block vectors, s, and copies its entries into the naturally ordered vector,...
void get_block_vector(const unsigned &n, const DoubleVector &v, DoubleVector &b) const
Takes the naturally ordered vector, v and returns the n-th block vector, b. Here n is the block numbe...
void return_block_ordered_preconditioner_vector(const DoubleVector &w, DoubleVector &v) const
Takes the block ordered vector, w, and reorders it in natural order. Reordered vector is returned in ...
void turn_into_subsidiary_block_preconditioner(BlockPreconditioner< MATRIX > *master_block_prec_pt, const Vector< unsigned > &doftype_in_master_preconditioner_coarse)
Function to turn this preconditioner into a subsidiary preconditioner that operates within a bigger "...
void internal_get_block(const unsigned &i, const unsigned &j, MATRIX &output_block) const
Gets block (i,j) from the matrix pointed to by Matrix_pt and returns it in output_block....
void internal_get_block_vectors(const Vector< unsigned > &block_vec_number, const DoubleVector &v, Vector< DoubleVector > &s) const
Takes the naturally ordered vector and rearranges it into a vector of sub vectors corresponding to th...
void get_block_vectors(const Vector< unsigned > &block_vec_number, const DoubleVector &v, Vector< DoubleVector > &s) const
Takes the naturally ordered vector and rearranges it into a vector of sub vectors corresponding to th...
void internal_get_block_vector(const unsigned &n, const DoubleVector &v, DoubleVector &b) const
A helper function, takes the naturally ordered vector, v, and extracts the n-th block vector,...
virtual void block_setup()
Determine the size of the matrix blocks and setup the lookup schemes relating the global degrees of f...
void get_concatenated_block_vector(const Vector< unsigned > &block_vec_number, const DoubleVector &v, DoubleVector &b)
Takes the naturally ordered vector and extracts the blocks indicated by the block number (the values)...
void internal_get_block_ordered_preconditioner_vector(const DoubleVector &v, DoubleVector &w) const
Given the naturally ordered vector, v, return the vector rearranged in block order in w....
void get_block_ordered_preconditioner_vector(const DoubleVector &v, DoubleVector &w)
Given the naturally ordered vector, v, return the vector rearranged in block order in w....
A class for compressed row matrices. This is a distributable object.
Definition matrices.h:888
static long Is_unclassified
Static "Magic number" used in place of the equation number to denote a value that hasn't been classif...
Definition nodes.h:192
unsigned nrow() const
access function to the number of global rows.
LinearAlgebraDistribution * distribution_pt() const
access to the LinearAlgebraDistribution
A vector in the mathematical sense, initially developed for linear algebra type applications....
void build(const DoubleVector &old_vector)
Just copies the argument DoubleVector.
double * values_pt()
access function to the underlying values
double size() const
Calculate the size of the element (length, area, volume,...) in Eulerian computational coordinates....
Definition elements.cc:4320
virtual unsigned ndof_types() const
The number of types of degrees of freedom in this element are sub-divided into.
Definition elements.h:1189
bool is_halo() const
Is this element a halo?
Definition elements.h:1150
void dof_vector(const unsigned &t, Vector< double > &dof)
Return the vector of dof values at time level t.
Definition elements.h:828
unsigned ndof() const
Return the number of equations/dofs in the element.
Definition elements.h:822
An OomphLibError object which should be thrown when an run-time error is encountered....
An OomphLibWarning object which should be created as a temporary object to issue a warning....
TAdvectionDiffusionReactionElement<NREAGENT,DIM,NNODE_1D> elements are isoparametric triangular DIM-d...
TAdvectionDiffusionReactionElement()
Constructor: Call constructors for TElement and AdvectionDiffusionReaction equations.
A slight extension to the standard template vector class so that we can include "graceful" array rang...
Definition Vector.h:58
void deep_copy(const CRDoubleMatrix *const in_matrix_pt, CRDoubleMatrix &out_matrix)
Create a deep copy of the matrix pointed to by in_matrix_pt.
Definition matrices.h:3490
void concatenate_without_communication(const Vector< LinearAlgebraDistribution * > &row_distribution_pt, const Vector< LinearAlgebraDistribution * > &col_distribution_pt, const DenseMatrix< CRDoubleMatrix * > &matrix_pt, CRDoubleMatrix &result_matrix)
Concatenate CRDoubleMatrix matrices.
Definition matrices.cc:5223
void concatenate_without_communication(const Vector< DoubleVector * > &in_vector_pt, DoubleVector &out_vector)
Concatenate DoubleVectors. Takes a Vector of DoubleVectors. If the out vector is built,...
void split_without_communication(const DoubleVector &in_vector, Vector< DoubleVector * > &out_vector_pt)
Split a DoubleVector into the out DoubleVectors. Data stays on its current processor,...
void concatenate(const Vector< LinearAlgebraDistribution * > &in_distribution_pt, LinearAlgebraDistribution &out_distribution)
Takes a vector of LinearAlgebraDistribution objects and concatenates them such that the nrow_local of...
DRAIG: Change all instances of (SPATIAL_DIM) to (DIM-1).