diff --git a/firedrake/assemble.py b/firedrake/assemble.py
index 5db7c007ce..2992fad9d9 100644
--- a/firedrake/assemble.py
+++ b/firedrake/assemble.py
@@ -364,7 +364,9 @@ def allocate(self):
             else:
                 test, trial = self._form.arguments()
                 sparsity = ExplicitMatrixAssembler._make_sparsity(test, trial, self._mat_type, self._sub_mat_type, self.maps_and_regions)
-                return matrix.Matrix(self._form, self._bcs, self._mat_type, sparsity, ScalarType, options_prefix=self._options_prefix)
+                return matrix.Matrix(self._form, self._bcs, self._mat_type, sparsity, ScalarType,
+                                     sub_mat_type=self._sub_mat_type,
+                                     options_prefix=self._options_prefix)
         else:
             raise NotImplementedError("Only implemented for rank = 2 and diagonal = False")
 
@@ -1324,12 +1326,12 @@ def _get_mat_type(mat_type, sub_mat_type, arguments):
                for arg in arguments
                for V in arg.function_space()):
             mat_type = "nest"
-    if mat_type not in {"matfree", "aij", "baij", "nest", "dense"}:
+    if mat_type not in {"matfree", "aij", "baij", "nest", "dense", "is"}:
         raise ValueError(f"Unrecognised matrix type, '{mat_type}'")
     if sub_mat_type is None:
         sub_mat_type = parameters.parameters["default_sub_matrix_type"]
-    if sub_mat_type not in {"aij", "baij"}:
-        raise ValueError(f"Invalid submatrix type, '{sub_mat_type}' (not 'aij' or 'baij')")
+    if sub_mat_type not in {"aij", "baij", "is"}:
+        raise ValueError(f"Invalid submatrix type, '{sub_mat_type}' (not 'aij', 'baij', or 'is')")
     return mat_type, sub_mat_type
 
 
@@ -1373,6 +1375,7 @@ def allocate(self):
                                                           self._sub_mat_type,
                                                           self._make_maps_and_regions())
         return matrix.Matrix(self._form, self._bcs, self._mat_type, sparsity, ScalarType,
+                             sub_mat_type=self._sub_mat_type,
                              options_prefix=self._options_prefix,
                              fc_params=self._form_compiler_params)
 
@@ -1490,8 +1493,10 @@ def _apply_bc(self, tensor, bc, u=None):
             # Set diagonal entries on bc nodes to 1 if the current
             # block is on the matrix diagonal and its index matches the
             # index of the function space the bc is defined on.
+            if op2tensor.handle.getType() == "is":
+                # Flag the entire matrix as assembled before indexing the diagonal block
+                op2tensor.handle.assemble()
             op2tensor[index, index].set_local_diagonal_entries(bc.nodes, idx=component, diag_val=self.weight)
-
             # Handle off-diagonal block involving real function space.
             # "lgmaps" is correctly constructed in _matrix_arg, but
             # is ignored by PyOP2 in this case.
@@ -2067,16 +2072,18 @@ def collect_lgmaps(self):
                 row_bcs, col_bcs = self._filter_bcs(i, j)
                 # the tensor is already indexed
                 rlgmap, clgmap = self._tensor.local_to_global_maps
-                rlgmap = self.test_function_space[i].local_to_global_map(row_bcs, rlgmap)
-                clgmap = self.trial_function_space[j].local_to_global_map(col_bcs, clgmap)
+                mat_type = self._tensor.handle.getType()
+                rlgmap = self.test_function_space[i].local_to_global_map(row_bcs, rlgmap, mat_type=mat_type)
+                clgmap = self.trial_function_space[j].local_to_global_map(col_bcs, clgmap, mat_type=mat_type)
                 return ((rlgmap, clgmap),)
             else:
                 lgmaps = []
                 for i, j in self.get_indicess():
                     row_bcs, col_bcs = self._filter_bcs(i, j)
                     rlgmap, clgmap = self._tensor[i, j].local_to_global_maps
-                    rlgmap = self.test_function_space[i].local_to_global_map(row_bcs, rlgmap)
-                    clgmap = self.trial_function_space[j].local_to_global_map(col_bcs, clgmap)
+                    mat_type = self._tensor[i, j].handle.getType()
+                    rlgmap = self.test_function_space[i].local_to_global_map(row_bcs, rlgmap, mat_type=mat_type)
+                    clgmap = self.trial_function_space[j].local_to_global_map(col_bcs, clgmap, mat_type=mat_type)
                     lgmaps.append((rlgmap, clgmap))
                 return tuple(lgmaps)
         else:
diff --git a/firedrake/functionspaceimpl.py b/firedrake/functionspaceimpl.py
index 6dd1aebbdc..7136fa9a3a 100644
--- a/firedrake/functionspaceimpl.py
+++ b/firedrake/functionspaceimpl.py
@@ -856,10 +856,31 @@ def boundary_nodes(self, sub_domain):
         return self._shared_data.boundary_nodes(self, sub_domain)
 
     @PETSc.Log.EventDecorator()
-    def local_to_global_map(self, bcs, lgmap=None):
+    def local_to_global_map(self, bcs, lgmap=None, mat_type=None):
         r"""Return a map from process local dof numbering to global dof numbering.
 
-        If BCs is provided, mask out those dofs which match the BC nodes."""
+        Parameters
+        ----------
+        bcs: [firedrake.bcs.BCBase]
+            If provided, mask out those dofs which match the BC nodes.
+        lgmap: PETSc.LGMap
+            The base local-to-global map, which might be partially masked.
+        mat_type: str
+            The matrix assembly type. This is required as different matrix types
+            handle the LGMap differently for MixedFunctionSpace.
+
+        Note
+        ----
+            For a :func:`.VectorFunctionSpace` or :func:`.TensorFunctionSpace` the returned
+            LGMap will be the scalar one, unless the bcs are imposed on a particular component.
+            For a :class:`MixedFunctionSpace` the returned LGMap is unblocked,
+            unless mat_type == "is".
+
+        Returns
+        -------
+        PETSc.LGMap
+            A local-to-global map with masked BC dofs.
+        """
         # Caching these things is too complicated, since it depends
         # not just on the bcs, but also the parent space, and anything
         # this space has been recursively split out from [e.g. inside
@@ -884,10 +905,16 @@ def local_to_global_map(self, bcs, lgmap=None):
                 bsize = lgmap.getBlockSize()
                 assert bsize == self.block_size
         else:
-            # MatBlock case, LGMap is already unrolled.
-            indices = lgmap.block_indices.copy()
+            # MatBlock case, the LGMap is implementation dependent
             bsize = lgmap.getBlockSize()
-            unblocked = True
+            assert bsize == self.block_size
+            if mat_type == "is":
+                indices = lgmap.indices.copy()
+                unblocked = False
+            else:
+                # LGMap is already unrolled
+                indices = lgmap.block_indices.copy()
+                unblocked = True
         nodes = []
         for bc in bcs:
             if bc.function_space().component is not None:
@@ -997,7 +1024,7 @@ def __hash__(self):
         return hash((self.mesh(), self.dof_dset, self.ufl_element(),
                      self.boundary_set))
 
-    def local_to_global_map(self, bcs, lgmap=None):
+    def local_to_global_map(self, bcs, lgmap=None, mat_type=None):
         return lgmap or self.dof_dset.lgmap
 
     def collapse(self):
@@ -1204,7 +1231,7 @@ def exterior_facet_node_map(self):
         function space nodes."""
         return op2.MixedMap(s.exterior_facet_node_map() for s in self)
 
-    def local_to_global_map(self, bcs):
+    def local_to_global_map(self, bcs, lgmap=None, mat_type=None):
         r"""Return a map from process local dof numbering to global dof numbering.
 
         If BCs is provided, mask out those dofs which match the BC nodes."""
@@ -1452,7 +1479,7 @@ def top_nodes(self):
         ":class:`RealFunctionSpace` objects have no bottom nodes."
         return None
 
-    def local_to_global_map(self, bcs, lgmap=None):
+    def local_to_global_map(self, bcs, lgmap=None, mat_type=None):
         assert len(bcs) == 0
         return None
 
diff --git a/firedrake/preconditioners/bddc.py b/firedrake/preconditioners/bddc.py
index 8800b77d79..24e47320f2 100644
--- a/firedrake/preconditioners/bddc.py
+++ b/firedrake/preconditioners/bddc.py
@@ -4,8 +4,13 @@
 from firedrake.petsc import PETSc
 from firedrake.dmhooks import get_function_space, get_appctx
 from firedrake.ufl_expr import TestFunction, TrialFunction
-from ufl import curl, div, HCurl, HDiv, inner, dx
+from firedrake.function import Function
+from firedrake.functionspace import FunctionSpace, VectorFunctionSpace, TensorFunctionSpace
+from firedrake.preconditioners.fdm import tabulate_exterior_derivative
+from firedrake.preconditioners.hiptmair import curl_to_grad
+from ufl import H1, H2, inner, dx, JacobianDeterminant
 from pyop2.utils import as_tuple
+import gem
 import numpy
 
 __all__ = ("BDDCPC",)
@@ -22,17 +27,13 @@ class BDDCPC(PCBase):
     - ``'bddc_pc_bddc_dirichlet'`` to set sub-KSPs on subdomain interiors,
     - ``'bddc_pc_bddc_coarse'`` to set the coarse solver KSP.
 
-    This PC also inspects optional arguments supplied in the application context:
-    - ``'discrete_gradient'`` for problems in H(curl), this sets the arguments
-    (a Mat tabulating the gradient of the auxiliary H1 space) and
+    This PC also inspects optional callbacks supplied in the application context:
+    - ``'get_discrete_gradient'`` for 3D problems in H(curl), this is a callable that
+    provide the arguments (a Mat tabulating the gradient of the auxiliary H1 space) and
     keyword arguments supplied to ``PETSc.PC.setBDDCDiscreteGradient``.
-    - ``'divergence_mat'`` for 3D problems in H(div), this sets the Mat with the
-    assembled bilinear form testing the divergence against an L2 space.
-
-    Notes
-    -----
-    Currently the Mat type IS is only supported by FDMPC.
-
+    - ``'get_divergence_mat'`` for problems in H(div) (resp. 2D H(curl)), this is
+    provide the arguments (a Mat with the assembled bilinear form testing the divergence
+    (curl) against an L2 space) and keyword arguments supplied to ``PETSc.PC.setDivergenceMat``.
     """
 
     _prefix = "bddc_"
@@ -44,6 +45,8 @@ def initialize(self, pc):
         self.prefix = (pc.getOptionsPrefix() or "") + self._prefix
 
         V = get_function_space(dm)
+        variant = V.ufl_element().variant()
+        sobolev_space = V.ufl_element().sobolev_space
 
         # Create new PC object as BDDC type
         bddcpc = PETSc.PC().create(comm=pc.comm)
@@ -53,13 +56,17 @@ def initialize(self, pc):
         bddcpc.setType(PETSc.PC.Type.BDDC)
 
         opts = PETSc.Options(bddcpc.getOptionsPrefix())
-        if V.ufl_element().variant() == "fdm" and "pc_bddc_use_local_mat_graph" not in opts:
-            # Disable computation of disconected components of subdomain interfaces
+        # Do not use CSR of local matrix to define dofs connectivity unless requested
+        # Using the CSR only makes sense for H1/H2 problems
+        is_h1h2 = sobolev_space in [H1, H2]
+        if "pc_bddc_use_local_mat_graph" not in opts and (not is_h1h2 or not is_lagrange(V.finat_element)):
             opts["pc_bddc_use_local_mat_graph"] = False
 
+        # Handle boundary dofs
         ctx = get_appctx(dm)
-        bcs = tuple(ctx._problem.bcs)
-        if V.extruded:
+        bcs = tuple(ctx._problem.dirichlet_bcs())
+        mesh = V.mesh().unique()
+        if mesh.extruded and not mesh.extruded_periodic:
             boundary_nodes = numpy.unique(numpy.concatenate(list(map(V.boundary_nodes, ("on_boundary", "top", "bottom")))))
         else:
             boundary_nodes = V.boundary_nodes("on_boundary")
@@ -69,46 +76,44 @@ def initialize(self, pc):
             dir_nodes = numpy.unique(numpy.concatenate([bcdofs(bc, ghost=False) for bc in bcs]))
         neu_nodes = numpy.setdiff1d(boundary_nodes, dir_nodes)
 
-        V.dof_dset.lgmap.apply(dir_nodes, result=dir_nodes)
+        dir_nodes = V.dof_dset.lgmap.apply(dir_nodes)
         dir_bndr = PETSc.IS().createGeneral(dir_nodes, comm=pc.comm)
         bddcpc.setBDDCDirichletBoundaries(dir_bndr)
 
-        V.dof_dset.lgmap.apply(neu_nodes, result=neu_nodes)
+        neu_nodes = V.dof_dset.lgmap.apply(neu_nodes)
         neu_bndr = PETSc.IS().createGeneral(neu_nodes, comm=pc.comm)
         bddcpc.setBDDCNeumannBoundaries(neu_bndr)
 
         appctx = self.get_appctx(pc)
-        sobolev_space = V.ufl_element().sobolev_space
+        degree = max(as_tuple(V.ufl_element().degree()))
+
+        # Set coordinates only if corner selection is requested
+        # There's no API to query from PC
+        if "pc_bddc_corner_selection" in opts:
+            W = VectorFunctionSpace(V.mesh(), "Lagrange", degree, variant=variant)
+            coords = Function(W).interpolate(V.mesh().coordinates)
+            bddcpc.setCoordinates(coords.dat.data_ro.repeat(V.block_size, axis=0))
 
         tdim = V.mesh().topological_dimension
-        degree = max(as_tuple(V.ufl_element().degree()))
         if tdim >= 2 and V.finat_element.formdegree == tdim-1:
-            B = appctx.get("divergence_mat", None)
-            if B is None:
-                from firedrake.assemble import assemble
-                d = {HCurl: curl, HDiv: div}[sobolev_space]
-                Q = V.reconstruct(family="DG", degree=degree-1)
-                b = inner(d(TrialFunction(V)), TestFunction(Q)) * dx(degree=2*(degree-1))
-                B = assemble(b, mat_type="matfree")
-            bddcpc.setBDDCDivergenceMat(B.petscmat)
-        elif sobolev_space == HCurl:
-            gradient = appctx.get("discrete_gradient", None)
-            if gradient is None:
-                from firedrake.preconditioners.fdm import tabulate_exterior_derivative
-                from firedrake.preconditioners.hiptmair import curl_to_grad
-                Q = V.reconstruct(element=curl_to_grad(V.ufl_element()))
-                gradient = tabulate_exterior_derivative(Q, V)
-                corners = get_vertex_dofs(Q)
-                gradient.compose('_elements_corners', corners)
+            allow_repeated = P.getISAllowRepeated()
+            get_divergence = appctx.get("get_divergence_mat", get_divergence_mat)
+            divergence = get_divergence(V, mat_type="is", allow_repeated=allow_repeated)
+            try:
+                div_args, div_kwargs = divergence
+            except ValueError:
+                div_args = (divergence,)
+                div_kwargs = dict()
+            bddcpc.setBDDCDivergenceMat(*div_args, **div_kwargs)
+
+        elif tdim >= 3 and V.finat_element.formdegree == 1:
+            get_gradient = appctx.get("get_discrete_gradient", get_discrete_gradient)
+            gradient = get_gradient(V)
+            try:
+                grad_args, grad_kwargs = gradient
+            except ValueError:
                 grad_args = (gradient,)
-                grad_kwargs = {'order': degree}
-            else:
-                try:
-                    grad_args, grad_kwargs = gradient
-                except ValueError:
-                    grad_args = (gradient,)
-                    grad_kwargs = dict()
-
+                grad_kwargs = dict()
             bddcpc.setBDDCDiscreteGradient(*grad_args, **grad_kwargs)
 
         bddcpc.setFromOptions()
@@ -127,9 +132,67 @@ def applyTranspose(self, pc, x, y):
         self.pc.applyTranspose(x, y)
 
 
-def get_vertex_dofs(V):
-    W = V.reconstruct(element=restrict(V.ufl_element(), "vertex"))
+def get_restricted_dofs(V, domain):
+    W = FunctionSpace(V.mesh(), restrict(V.ufl_element(), domain))
     indices = get_restriction_indices(V, W)
-    V.dof_dset.lgmap.apply(indices, result=indices)
-    vertex_dofs = PETSc.IS().createGeneral(indices, comm=V.comm)
-    return vertex_dofs
+    indices = V.dof_dset.lgmap.apply(indices)
+    return PETSc.IS().createGeneral(indices, comm=V.comm)
+
+
+def get_divergence_mat(V, mat_type="is", allow_repeated=False):
+    from firedrake import assemble
+    degree = max(as_tuple(V.ufl_element().degree()))
+    Q = TensorFunctionSpace(V.mesh(), "DG", 0, variant=f"integral({degree-1})", shape=V.value_shape[:-1])
+    B = tabulate_exterior_derivative(V, Q, mat_type=mat_type, allow_repeated=allow_repeated)
+
+    Jdet = JacobianDeterminant(V.mesh())
+    s = assemble(inner(TrialFunction(Q)*(1/Jdet), TestFunction(Q))*dx(degree=0), diagonal=True)
+    with s.dat.vec as svec:
+        B.diagonalScale(svec, None)
+    return (B,), {}
+
+
+def get_discrete_gradient(V):
+    from firedrake import Constant
+    from firedrake.nullspace import VectorSpaceBasis
+
+    Q = FunctionSpace(V.mesh(), curl_to_grad(V.ufl_element()))
+    gradient = tabulate_exterior_derivative(Q, V)
+    basis = Function(Q)
+    try:
+        basis.interpolate(Constant(1))
+    except NotImplementedError:
+        basis.project(Constant(1))
+    nsp = VectorSpaceBasis([basis])
+    nsp.orthonormalize()
+    gradient.setNullSpace(nsp.nullspace())
+    if not is_lagrange(Q.finat_element):
+        vdofs = get_restricted_dofs(Q, "vertex")
+        gradient.compose('_elements_corners', vdofs)
+
+    degree = max(as_tuple(Q.ufl_element().degree()))
+    grad_args = (gradient,)
+    grad_kwargs = {'order': degree}
+    return grad_args, grad_kwargs
+
+
+def is_lagrange(finat_element):
+    """Returns whether finat_element.dual_basis consists only of point evaluation dofs."""
+    try:
+        Q, ps = finat_element.dual_basis
+    except NotImplementedError:
+        return False
+    # Inspect the weight matrix
+    # Lagrange elements have gem.Delta as the only terminal nodes
+    children = [Q]
+    while children:
+        nodes = []
+        for c in children:
+            if isinstance(c, gem.Delta):
+                pass
+            elif isinstance(c, gem.gem.Terminal):
+                return False
+            else:
+                nodes.extend(c.children)
+        children = nodes
+    return True
diff --git a/firedrake/preconditioners/facet_split.py b/firedrake/preconditioners/facet_split.py
index f4ac80eea0..722319631f 100644
--- a/firedrake/preconditioners/facet_split.py
+++ b/firedrake/preconditioners/facet_split.py
@@ -1,7 +1,7 @@
 from functools import partial
 from mpi4py import MPI
 from pyop2 import op2, PermutedMap
-from finat.ufl import RestrictedElement, MixedElement, TensorElement, VectorElement
+from finat.ufl import BrokenElement, RestrictedElement, MixedElement, TensorElement, VectorElement
 from firedrake.petsc import PETSc
 from firedrake.preconditioners.base import PCBase
 from firedrake.bcs import restricted_function_space
@@ -209,7 +209,9 @@ def restrict(ele, restriction_domain):
     if isinstance(ele, VectorElement):
         return type(ele)(restrict(ele._sub_element, restriction_domain), dim=ele.num_sub_elements)
     elif isinstance(ele, TensorElement):
-        return type(ele)(restrict(ele._sub_element, restriction_domain), shape=ele._shape, symmetry=ele._symmety)
+        return type(ele)(restrict(ele._sub_element, restriction_domain), shape=ele._shape, symmetry=ele._symmetry)
+    elif restriction_domain == "broken":
+        return BrokenElement(ele)
     else:
         return RestrictedElement(ele, restriction_domain)
 
@@ -242,7 +244,7 @@ def restricted_dofs(celem, felem):
     cdofs = celem.entity_dofs()
     fdofs = felem.entity_dofs()
     for dim in sorted(cdofs):
-        for entity in cdofs[dim]:
+        for entity in sorted(cdofs[dim]):
             ndofs = len(cdofs[dim][entity])
             indices[cdofs[dim][entity]] = fdofs[dim][entity][:ndofs]
     return indices
@@ -251,6 +253,9 @@ def restricted_dofs(celem, felem):
 def get_restriction_indices(V, W):
     """Return the list of dofs in the space V such that W = V[indices].
     """
+    if V.cell_node_map() is W.cell_node_map():
+        return numpy.arange(V.dof_dset.layout_vec.getSizes()[0], dtype=PETSc.IntType)
+
     vdat = V.make_dat(val=numpy.arange(V.dof_count, dtype=PETSc.IntType))
     wdats = [Wsub.make_dat(val=numpy.full((Wsub.dof_count,), -1, dtype=PETSc.IntType)) for Wsub in W]
     wdat = wdats[0] if len(W) == 1 else op2.MixedDat(wdats)
diff --git a/firedrake/preconditioners/fdm.py b/firedrake/preconditioners/fdm.py
index 2fb8443676..3b43c989f2 100644
--- a/firedrake/preconditioners/fdm.py
+++ b/firedrake/preconditioners/fdm.py
@@ -8,7 +8,7 @@
                                            evaluate_dual,
                                            get_permutation_to_nodal_elements,
                                            cache_generate_code)
-from firedrake.preconditioners.facet_split import split_dofs, restricted_dofs
+from firedrake.preconditioners.facet_split import restrict, restricted_dofs, split_dofs
 from firedrake.formmanipulation import ExtractSubBlock
 from firedrake.functionspace import FunctionSpace, MixedFunctionSpace
 from firedrake.function import Function
@@ -40,33 +40,6 @@
 __all__ = ("FDMPC", "PoissonFDMPC")
 
 
-def broken_function(V, val):
-    W = FunctionSpace(V.mesh(), finat.ufl.BrokenElement(V.ufl_element()))
-    w = Function(W, dtype=val.dtype)
-    v = Function(V, val=val)
-    domain = "{[i]: 0 <= i < v.dofs}"
-    instructions = """
-    for i
-        w[i] = v[i]
-    end
-    """
-    par_loop((domain, instructions), ufl.dx, {'w': (w, op2.WRITE), 'v': (v, op2.READ)})
-    return w
-
-
-def mask_local_indices(V, lgmap, repeated=False):
-    mask = lgmap.indices
-    if repeated:
-        w = broken_function(V, mask)
-        V = w.function_space()
-        mask = w.dat.data_ro_with_halos
-    indices = numpy.arange(len(mask), dtype=PETSc.IntType)
-    indices[mask == -1] = -1
-    indices_dat = V.make_dat(val=indices)
-    indices_acc = indices_dat(op2.READ, V.cell_node_map())
-    return indices_acc
-
-
 class FDMPC(PCBase):
     """
     A preconditioner for tensor-product elements that changes the shape
@@ -83,7 +56,7 @@ class FDMPC(PCBase):
     matrices.
 
     The PETSc options inspected by this class are:
-    - 'fdm_mat_type': can be either 'aij' or 'sbaij'
+    - 'fdm_mat_type': can be either 'aij', 'sbaij', or 'is'
     - 'fdm_static_condensation': are we assembling the Schur complement on facets?
     """
 
@@ -134,7 +107,10 @@ def initialize(self, pc):
 
         # Transform the problem into the space with FDM shape functions
         V = J.arguments()[-1].function_space()
-        V_fdm = V.reconstruct(variant=self._variant)
+        if self._variant == "fdm" and V.ufl_element().variant() in {"fdm", "demkowicz", "demkowiczmass"}:
+            V_fdm = V
+        else:
+            V_fdm = V.reconstruct(variant=self._variant)
         if V == V_fdm:
             J_fdm, bcs_fdm = (J, bcs)
         else:
@@ -169,7 +145,7 @@ def initialize(self, pc):
                 self.bc_nodes = numpy.empty(0, dtype=PETSc.IntType)
 
         # Internally, we just set up a PC object that the user can configure
-        # however from the PETSc command line.  Since PC allows the user to specify
+        # however from the PETSc command line. Since PC allows the user to specify
         # a KSP, we can do iterative by -fdm_pc_type ksp.
         fdmpc = PETSc.PC().create(comm=pc.comm)
         fdmpc.incrementTabLevel(1, parent=pc)
@@ -218,15 +194,15 @@ def allocate_matrix(self, Amat, V, J, bcs, fcp, pmat_type, use_static_condensati
             Vfacet = None
             Vbig = V
             ebig = V.ufl_element()
-            _, fdofs = split_dofs(V.finat_element)
+            idofs, fdofs = split_dofs(V.finat_element)
         elif len(ifacet) == 1:
             Vfacet = V[ifacet[0]]
             ebig, = set(unrestrict_element(Vsub.ufl_element()) for Vsub in V)
             Vbig = V.reconstruct(mesh=V.mesh().unique(), element=ebig)
-            if len(V) > 1:
-                dims = [Vsub.finat_element.space_dimension() for Vsub in V]
-                assert sum(dims) == Vbig.finat_element.space_dimension()
+            space_dim = Vbig.finat_element.space_dimension()
+            assert space_dim == sum(Vsub.finat_element.space_dimension() for Vsub in V)
             fdofs = restricted_dofs(Vfacet.finat_element, Vbig.finat_element)
+            idofs = numpy.setdiff1d(numpy.arange(space_dim, dtype=fdofs.dtype), fdofs)
         else:
             raise ValueError("Expecting at most one FunctionSpace restricted onto facets.")
         self.embedding_element = ebig
@@ -245,7 +221,7 @@ def allocate_matrix(self, Amat, V, J, bcs, fcp, pmat_type, use_static_condensati
 
         # Dictionary with kernel to compute the Schur complement
         self.schur_kernel = {}
-        if V == Vbig and Vbig.finat_element.formdegree == 0:
+        if V == Vbig and Vbig.finat_element.formdegree == 0 and len(idofs) > 0 and pmat_type.endswith("aij"):
             # If we are in H(grad), we just pad with zeros on the statically-condensed pattern
             self.schur_kernel[V] = SchurComplementPattern
         elif Vfacet and use_static_condensation:
@@ -497,7 +473,7 @@ def assemble_coefficients(self, J, fcp, block_diagonal=False):
             dku = ufl.div(u) if sobolev == ufl.HDiv else ufl.curl(u)
             eps = expand_derivatives(ufl.diff(ufl.replace(expand_derivatives(dku), {ufl.grad(u): du}), du))
             if sobolev == ufl.HDiv:
-                map_grad = lambda p: ufl.outer(p, eps/tdim)
+                map_grad = lambda p: ufl.conj(ufl.outer(p, eps/tdim))
             elif len(eps.ufl_shape) == 3:
                 map_grad = lambda p: ufl.dot(p, eps/2)
             else:
@@ -710,65 +686,28 @@ def insert_mode(self):
     def assembly_lgmaps(self):
         if self.mat_type != "is":
             return {Vsub: Vsub.dof_dset.lgmap for Vsub in self.V}
-        lgmaps = {}
-        for Vsub in self.V:
-            lgmap = Vsub.dof_dset.lgmap
-            if self.allow_repeated:
-                indices = broken_function(Vsub, lgmap.indices).dat.data_ro
-            else:
-                indices = lgmap.indices.copy()
-                local_indices = numpy.arange(len(indices), dtype=PETSc.IntType)
-                cell_node_map = broken_function(Vsub, local_indices).dat.data_ro
-                ghost = numpy.setdiff1d(local_indices, numpy.unique(cell_node_map), assume_unique=True)
-                indices[ghost] = -1
-            lgmaps[Vsub] = PETSc.LGMap().create(indices, bsize=lgmap.getBlockSize(), comm=lgmap.getComm())
-        return lgmaps
+        return {Vsub: unghosted_lgmap(Vsub, Vsub.dof_dset.lgmap, self.allow_repeated) for Vsub in self.V}
 
     def setup_block(self, Vrow, Vcol):
-        # Preallocate the auxiliary sparse operator
+        """Preallocate the auxiliary sparse operator."""
         sizes = tuple(Vsub.dof_dset.layout_vec.getSizes() for Vsub in (Vrow, Vcol))
         rmap = self.assembly_lgmaps[Vrow]
         cmap = self.assembly_lgmaps[Vcol]
         on_diag = Vrow == Vcol
         ptype = self.mat_type if on_diag else PETSc.Mat.Type.AIJ
 
-        preallocator = PETSc.Mat().create(comm=self.comm)
-        preallocator.setType(PETSc.Mat.Type.PREALLOCATOR)
-        preallocator.setSizes(sizes)
-        preallocator.setISAllowRepeated(self.allow_repeated)
-        preallocator.setLGMap(rmap, cmap)
-        preallocator.setOption(PETSc.Mat.Option.IGNORE_ZERO_ENTRIES, False)
-        if ptype.endswith("sbaij"):
-            preallocator.setOption(PETSc.Mat.Option.IGNORE_LOWER_TRIANGULAR, True)
-        preallocator.setUp()
+        preallocator = get_preallocator(self.comm, sizes, rmap, cmap, mat_type=ptype)
         self.set_values(preallocator, Vrow, Vcol)
         preallocator.assemble()
-        dnz, onz = get_preallocation(preallocator, sizes[0][0])
-        if on_diag:
-            numpy.maximum(dnz, 1, out=dnz)
+        P = allocate_matrix(preallocator, ptype, on_diag=on_diag, allow_repeated=self.allow_repeated)
         preallocator.destroy()
-        P = PETSc.Mat().create(comm=self.comm)
-        P.setType(ptype)
-        P.setSizes(sizes)
-        P.setISAllowRepeated(self.allow_repeated)
-        P.setLGMap(rmap, cmap)
-        if on_diag and ptype == "is" and self.allow_repeated:
-            bsize = Vrow.finat_element.space_dimension() * Vrow.block_size
+
+        if on_diag and P.type == "is" and self.allow_repeated:
+            bsize = Vrow.block_size * Vrow.finat_element.space_dimension()
             local_mat = P.getISLocalMat()
             nblocks = local_mat.getSize()[0] // bsize
             sizes = numpy.full((nblocks,), bsize, dtype=PETSc.IntType)
             local_mat.setVariableBlockSizes(sizes)
-        P.setPreallocationNNZ((dnz, onz))
-
-        if not (ptype.endswith("sbaij") or ptype == "is"):
-            P.setOption(PETSc.Mat.Option.UNUSED_NONZERO_LOCATION_ERR, True)
-        P.setOption(PETSc.Mat.Option.NEW_NONZERO_ALLOCATION_ERR, True)
-        P.setOption(PETSc.Mat.Option.STRUCTURALLY_SYMMETRIC, on_diag)
-        P.setOption(PETSc.Mat.Option.FORCE_DIAGONAL_ENTRIES, True)
-        P.setOption(PETSc.Mat.Option.KEEP_NONZERO_PATTERN, True)
-        if ptype.endswith("sbaij"):
-            P.setOption(PETSc.Mat.Option.IGNORE_LOWER_TRIANGULAR, True)
-        P.setUp()
         return P
 
     @PETSc.Log.EventDecorator("FDMSetValues")
@@ -1647,7 +1586,7 @@ def diff_blocks(tdim, formdegree, A00, A11, A10):
             A_blocks = [[A00.kron(A10)], [A10.kron(A00)]]
         elif formdegree == 1:
             A_blocks = [[A10.kron(A11), A11.kron(A10)]]
-            A_blocks[-1][-1].scale(-1)
+            A_blocks[0][0].scale(-1)
     elif tdim == 3:
         if formdegree == 0:
             A_blocks = [[kron3(A00, A00, A10)], [kron3(A00, A10, A00)], [kron3(A10, A00, A00)]]
@@ -1661,13 +1600,94 @@ def diff_blocks(tdim, formdegree, A00, A11, A10):
     return A_blocks
 
 
-def tabulate_exterior_derivative(Vc, Vf, cbcs=[], fbcs=[], comm=None):
-    """
-    Tabulate exterior derivative: Vc -> Vf as an explicit sparse matrix.
-    Works for any tensor-product basis. These are the same matrices one needs for HypreAMS and friends.
+def broken_function(V, val):
+    """Return a Function(V, val=val) interpolated onto the broken space."""
+    W = FunctionSpace(V.mesh(), restrict(V.ufl_element(), "broken"))
+    w = Function(W, dtype=val.dtype)
+    v = Function(V, val=val)
+    domain = "{[i]: 0 <= i < v.dofs}"
+    instructions = """
+    for i
+        w[i] = v[i]
+    end
     """
+    par_loop((domain, instructions), ufl.dx, {'w': (w, op2.WRITE), 'v': (v, op2.READ)})
+    return w
+
+
+def mask_local_indices(V, lgmap, allow_repeated):
+    """Return a numpy array with the masked local indices."""
+    mask = lgmap.indices
+    if allow_repeated:
+        w = broken_function(V, mask)
+        V = w.function_space()
+        mask = w.dat.data_ro_with_halos
+
+    indices = numpy.arange(mask.size, dtype=PETSc.IntType)
+    indices[mask == -1] = -1
+    indices_dat = V.make_dat(val=indices)
+    indices_acc = indices_dat(op2.READ, V.cell_node_map())
+    return indices_acc
+
+
+def unghosted_lgmap(V, lgmap, allow_repeated):
+    """Construct the local to global mapping for MatIS assembly."""
+    if allow_repeated:
+        indices = broken_function(V, lgmap.indices).dat.data_ro
+    else:
+        indices = lgmap.indices.copy()
+        local_indices = numpy.arange(indices.size, dtype=PETSc.IntType)
+        cell_node_map = broken_function(V, local_indices).dat.data_ro
+        ghost = numpy.setdiff1d(local_indices, numpy.unique(cell_node_map), assume_unique=True)
+        indices[ghost] = -1
+    return PETSc.LGMap().create(indices, bsize=lgmap.getBlockSize(), comm=lgmap.getComm())
+
+
+def get_preallocator(comm, sizes, rmap, cmap, mat_type=None):
+    """Set up a matrix preallocator."""
+    preallocator = PETSc.Mat().create(comm=comm)
+    preallocator.setType(PETSc.Mat.Type.PREALLOCATOR)
+    preallocator.setSizes(sizes)
+    preallocator.setLGMap(rmap, cmap)
+    preallocator.setOption(PETSc.Mat.Option.IGNORE_ZERO_ENTRIES, False)
+    if mat_type is not None and mat_type.endswith("sbaij"):
+        preallocator.setOption(PETSc.Mat.Option.IGNORE_LOWER_TRIANGULAR, True)
+    preallocator.setUp()
+    return preallocator
+
+
+def allocate_matrix(preallocator, mat_type, on_diag=False, allow_repeated=False):
+    """Set up a matrix from a preallocator."""
+    sizes = preallocator.getSizes()
+    nnz = get_preallocation(preallocator, sizes[0][0])
+    if on_diag:
+        numpy.maximum(nnz[0], 1, out=nnz[0])
+
+    A = PETSc.Mat().create(comm=preallocator.getComm())
+    A.setType(mat_type)
+    A.setSizes(sizes)
+    A.setBlockSize(preallocator.getBlockSize())
+    A.setISAllowRepeated(allow_repeated)
+    A.setLGMap(*preallocator.getLGMap())
+    A.setPreallocationNNZ(nnz)
+    if mat_type.endswith("sbaij"):
+        A.setOption(PETSc.Mat.Option.IGNORE_LOWER_TRIANGULAR, True)
+    if not (mat_type.endswith("sbaij") or mat_type == "is"):
+        A.setOption(PETSc.Mat.Option.UNUSED_NONZERO_LOCATION_ERR, True)
+    A.setOption(PETSc.Mat.Option.NEW_NONZERO_ALLOCATION_ERR, True)
+    A.setOption(PETSc.Mat.Option.STRUCTURALLY_SYMMETRIC, on_diag)
+    A.setOption(PETSc.Mat.Option.FORCE_DIAGONAL_ENTRIES, on_diag)
+    A.setOption(PETSc.Mat.Option.KEEP_NONZERO_PATTERN, True)
+    A.setUp()
+    return A
+
+
+def tabulate_exterior_derivative(Vc, Vf, cbcs=[], fbcs=[], comm=None, mat_type="aij", allow_repeated=False):
+    """Tabulate exterior derivative: Vc -> Vf as an explicit sparse matrix.
+       Works for any tensor-product basis. These are the same matrices one needs for HypreAMS and friends."""
     if comm is None:
         comm = Vf.comm
+
     ec = Vc.finat_element
     ef = Vf.finat_element
     if ef.formdegree - ec.formdegree != 1:
@@ -1718,29 +1738,30 @@ def tabulate_exterior_derivative(Vc, Vf, cbcs=[], fbcs=[], comm=None):
         temp.destroy()
         eye.destroy()
 
-    sizes = tuple(V.dof_dset.layout_vec.getSizes() for V in (Vf, Vc))
-    preallocator = PETSc.Mat().create(comm=comm)
-    preallocator.setType(PETSc.Mat.Type.PREALLOCATOR)
-    preallocator.setSizes(sizes)
-    preallocator.setUp()
-
-    kernel = ElementKernel(Dhat, name="exterior_derivative").kernel()
-    indices = tuple(op2.Dat(V.dof_dset, V.local_to_global_map(bcs).indices)(op2.READ, V.cell_node_map())
-                    for V, bcs in zip((Vf, Vc), (fbcs, cbcs)))
-    assembler = op2.ParLoop(kernel,
+    if mat_type != "is":
+        allow_repeated = False
+    spaces = (Vf, Vc)
+    bcs = (fbcs, cbcs)
+    lgmaps = tuple(V.local_to_global_map(bcs) for V, bcs in zip(spaces, bcs))
+    indices_acc = tuple(mask_local_indices(V, lgmap, allow_repeated) for V, lgmap in zip(spaces, lgmaps))
+    if mat_type == "is":
+        lgmaps = tuple(unghosted_lgmap(V, lgmap, allow_repeated) for V, lgmap in zip(spaces, lgmaps))
+
+    sizes = tuple(V.dof_dset.layout_vec.getSizes() for V in spaces)
+    preallocator = get_preallocator(comm, sizes, *lgmaps)
+
+    kernel = ElementKernel(Dhat, name="exterior_derivative")
+    assembler = op2.ParLoop(kernel.kernel(mat_type=mat_type),
                             Vc.mesh().cell_set,
-                            *(op2.PassthroughArg(op2.OpaqueType("Mat"), m.handle) for m in (preallocator, Dhat)),
-                            *indices)
+                            *kernel.make_args(preallocator),
+                            *indices_acc)
     assembler()
     preallocator.assemble()
-    nnz = get_preallocation(preallocator, sizes[0][0])
-    preallocator.destroy()
 
-    Dmat = PETSc.Mat().createAIJ(sizes, Vf.block_size, nnz=nnz, comm=comm)
-    Dmat.setOption(PETSc.Mat.Option.NEW_NONZERO_ALLOCATION_ERR, True)
+    Dmat = allocate_matrix(preallocator, mat_type, allow_repeated=allow_repeated)
     assembler.arguments[0].data = Dmat.handle
+    preallocator.destroy()
     assembler()
-
     Dmat.assemble()
     Dhat.destroy()
     return Dmat
@@ -2206,7 +2227,7 @@ def assemble_coefficients(self, J, fcp):
         if not isinstance(alpha, ufl.constantvalue.Zero):
             Q = FunctionSpace(mesh, finat.ufl.TensorElement(DG, shape=alpha.ufl_shape))
             tensor = coefficients.setdefault("alpha", Function(Q.dual()))
-            assembly_callables.append(partial(get_assembler(ufl.inner(TestFunction(Q), alpha)*dx, form_compiler_parameters=fcp).assemble, tensor=tensor))
+            assembly_callables.append(partial(get_assembler(ufl.inner(alpha, TestFunction(Q))*dx, form_compiler_parameters=fcp).assemble, tensor=tensor))
 
         # get zero-th order coefficent
         ref_val = [ufl.variable(t) for t in args_J]
@@ -2227,7 +2248,7 @@ def assemble_coefficients(self, J, fcp):
                 beta = ufl.diag_vector(beta)
             Q = FunctionSpace(mesh, finat.ufl.TensorElement(DG, shape=beta.ufl_shape) if beta.ufl_shape else DG)
             tensor = coefficients.setdefault("beta", Function(Q.dual()))
-            assembly_callables.append(partial(get_assembler(ufl.inner(TestFunction(Q), beta)*dx, form_compiler_parameters=fcp).assemble, tensor=tensor))
+            assembly_callables.append(partial(get_assembler(ufl.inner(beta, TestFunction(Q))*dx, form_compiler_parameters=fcp).assemble, tensor=tensor))
 
         family = "CG" if tdim == 1 else "DGT"
         degree = 1 if tdim == 1 else 0
@@ -2249,11 +2270,11 @@ def assemble_coefficients(self, J, fcp):
 
             Q = FunctionSpace(mesh, finat.ufl.TensorElement(DGT, shape=G.ufl_shape))
             tensor = coefficients.setdefault("Gq_facet", Function(Q.dual()))
-            assembly_callables.append(partial(get_assembler(ifacet_inner(TestFunction(Q), G), form_compiler_parameters=fcp).assemble, tensor=tensor))
+            assembly_callables.append(partial(get_assembler(ifacet_inner(G, TestFunction(Q)), form_compiler_parameters=fcp).assemble, tensor=tensor))
             PT = Piola.T
             Q = FunctionSpace(mesh, finat.ufl.TensorElement(DGT, shape=PT.ufl_shape))
             tensor = coefficients.setdefault("PT_facet", Function(Q.dual()))
-            assembly_callables.append(partial(get_assembler(ifacet_inner(TestFunction(Q), PT), form_compiler_parameters=fcp).assemble, tensor=tensor))
+            assembly_callables.append(partial(get_assembler(ifacet_inner(PT, TestFunction(Q)), form_compiler_parameters=fcp).assemble, tensor=tensor))
 
         # make DGT functions with BC flags
         shape = V.ufl_element().reference_value_shape
@@ -2273,7 +2294,7 @@ def assemble_coefficients(self, J, fcp):
                 if beta.ufl_shape:
                     beta = ufl.diag_vector(beta)
                 ds_ext = ufl.Measure(itype, domain=mesh, subdomain_id=it.subdomain_id(), metadata=md)
-                forms.append(ufl.inner(test, beta)*ds_ext)
+                forms.append(ufl.inner(beta, test)*ds_ext)
 
         tensor = coefficients.setdefault("bcflags", Function(Q.dual()))
         if len(forms):
diff --git a/pyop2/parloop.py b/pyop2/parloop.py
index c70f4c9fb7..6d208570b1 100644
--- a/pyop2/parloop.py
+++ b/pyop2/parloop.py
@@ -296,7 +296,8 @@ def replace_lgmaps(self):
                     olgmaps = []
                     for m, lgmaps in zip(pl_arg.data, pl_arg.lgmaps):
                         olgmaps.append(m.handle.getLGMap())
-                        m.handle.setLGMap(*lgmaps)
+                        if m.handle.type != "is":
+                            m.handle.setLGMap(*lgmaps)
                     orig_lgmaps.append(olgmaps)
         return tuple(orig_lgmaps)
 
@@ -309,7 +310,8 @@ def restore_lgmaps(self, orig_lgmaps):
         for arg, d in reversed(list(zip(self.global_kernel.arguments, self.arguments))):
             if isinstance(arg, (MatKernelArg, MixedMatKernelArg)) and d.lgmaps is not None:
                 for m, lgmaps in zip(d.data, orig_lgmaps.pop()):
-                    m.handle.setLGMap(*lgmaps)
+                    if m.handle.type != "is":
+                        m.handle.setLGMap(*lgmaps)
 
     @cached_property
     def _has_mats(self):
diff --git a/pyop2/types/mat.py b/pyop2/types/mat.py
index e03298360a..47e1cf8732 100644
--- a/pyop2/types/mat.py
+++ b/pyop2/types/mat.py
@@ -341,6 +341,55 @@ def masked_lgmap(lgmap, mask, block=True):
     return PETSc.LGMap().create(indices=indices, bsize=bsize, comm=lgmap.comm)
 
 
+def mask_ghost_cells(cell_node_map):
+    """Return the local indices of the nodes that belong to ghost cells."""
+    own_cells = cell_node_map.iterset.size
+    owned = cell_node_map.values[:own_cells]
+    ghost = cell_node_map.values_with_halo[own_cells:]
+    offset = cell_node_map.offset
+    if offset is None or ghost.size == 0:
+        # Non-extruded case
+        mask = np.setdiff1d(ghost, owned)
+    elif cell_node_map.iterset.constant_layers:
+        # Extruded case
+        mask_pieces = []
+        owned = owned.copy()
+        ghost = ghost.copy()
+        quotient = cell_node_map.offset_quotient
+        layers = cell_node_map.iterset.layers
+        for i in range(layers-1):
+            if quotient is not None and i == layers-2:
+                # Periodic extruded case
+                owned -= quotient
+                ghost -= quotient
+            mask_pieces.append(np.setdiff1d(ghost, owned))
+            owned += offset
+            ghost += offset
+        mask = np.concatenate(mask_pieces)
+    else:
+        raise NotImplementedError("MatIS does not support variable extrusion with overlap.")
+    return mask
+
+
+def unghosted_lgmap(dset, node_maps):
+    """Return a local-to-global map where the nodes on ghost cells are masked out."""
+    if len(node_maps) == 1:
+        # Non-mixed case
+        cmap, = node_maps
+        mask = mask_ghost_cells(cmap)
+    else:
+        # Mixed case
+        mask_pieces = []
+        for iset, cmap in zip(dset.local_ises, node_maps):
+            to_mask = mask_ghost_cells(cmap)
+            bs = iset.block_size
+            if bs > 1:
+                to_mask = np.concatenate([i + bs * to_mask for i in range(bs)])
+            mask_pieces.append(iset.indices[to_mask])
+        mask = np.concatenate(mask_pieces)
+    return masked_lgmap(dset.lgmap, mask)
+
+
 class AbstractMat(DataCarrier, abc.ABC):
     r"""OP2 matrix data. A ``Mat`` is defined on a sparsity pattern and holds a value
     for each element in the :class:`Sparsity`.
@@ -560,6 +609,7 @@ class Mat(AbstractMat):
 
     def __init__(self, *args, **kwargs):
         self.mat_type = kwargs.pop("mat_type", None)
+        self.sub_mat_type = kwargs.pop("sub_mat_type", None)
         super().__init__(*args, **kwargs)
         self._init()
         self.assembly_state = Mat.ASSEMBLED
@@ -616,13 +666,23 @@ def _init_dense(self):
     def _init_monolithic(self):
         mat = PETSc.Mat()
         rset, cset = self.sparsity.dsets
-        rlgmap = rset.unblocked_lgmap
-        clgmap = cset.unblocked_lgmap
-        mat.createAIJ(size=((self.nrows, None), (self.ncols, None)),
-                      nnz=(self.sparsity.nnz, self.sparsity.onnz),
-                      bsize=1,
-                      comm=self.comm)
+        if self.mat_type == "is":
+            rmaps = [None for _ in rset.local_ises]
+            cmaps = [None for _ in cset.local_ises]
+            for (i, j), maps_and_regions in self.sparsity._maps_and_regions.items():
+                for item in maps_and_regions:
+                    rmaps[i], cmaps[j], _ = item
+            rlgmap = unghosted_lgmap(rset, rmaps)
+            clgmap = unghosted_lgmap(cset, cmaps)
+            create = mat.createIS
+        else:
+            rlgmap = rset.unblocked_lgmap
+            clgmap = cset.unblocked_lgmap
+            create = mat.createAIJ
+        size = ((self.nrows, None), (self.ncols, None))
+        create(size, bsize=1, comm=self.comm)
         mat.setLGMap(rmap=rlgmap, cmap=clgmap)
+        mat.setPreallocationNNZ((self.sparsity.nnz, self.sparsity.onnz))
         self.handle = mat
         self._blocks = []
         rows, cols = self.sparsity.shape
@@ -635,7 +695,10 @@ def _init_monolithic(self):
         mat.setOption(mat.Option.KEEP_NONZERO_PATTERN, True)
         # We completely fill the allocated matrix when zeroing the
         # entries, so raise an error if we "missed" one.
-        mat.setOption(mat.Option.UNUSED_NONZERO_LOCATION_ERR, True)
+        if self.mat_type != "is":
+            # The local matrix will have fewer nonzeros than the one prescribed
+            # in the global sparsity pattern
+            mat.setOption(mat.Option.UNUSED_NONZERO_LOCATION_ERR, True)
         mat.setOption(mat.Option.IGNORE_OFF_PROC_ENTRIES, False)
         mat.setOption(mat.Option.NEW_NONZERO_ALLOCATION_ERR, True)
         # The first assembly (filling with zeros) sets all possible entries.
@@ -663,8 +726,10 @@ def _init_nest(self):
         for i in range(rows):
             row = []
             for j in range(cols):
+                # Only set sub_mat_type on the diagonal blocks
                 row.append(Mat(self.sparsity[i, j], self.dtype,
-                           '_'.join([self.name, str(i), str(j)])))
+                               '_'.join([self.name, str(i), str(j)]),
+                               mat_type=self.sub_mat_type if i == j else None))
             self._blocks.append(row)
         # PETSc Mat.createNest wants a flattened list of Mats
         mat.createNest([[m.handle for m in row_] for row_ in self._blocks],
@@ -685,7 +750,13 @@ def _init_block(self):
         col_lg = cset.lgmap
         rdim, cdim = self.dims[0][0]
 
-        if rdim == cdim and rdim > 1 and self.sparsity._block_sparse:
+        if self.mat_type == "is":
+            rmap, cmap, _ = tuple(self.sparsity._maps_and_regions[(0, 0)])[0]
+            row_lg = unghosted_lgmap(rset, [rmap])
+            col_lg = unghosted_lgmap(cset, [cmap])
+            block_sparse = False
+            create = mat.createIS
+        elif rdim == cdim and rdim > 1 and self.sparsity._block_sparse:
             # Size is total number of rows and columns, but the
             # /sparsity/ is the block sparsity.
             block_sparse = True
@@ -695,12 +766,11 @@ def _init_block(self):
             # the /dof/ sparsity.
             block_sparse = False
             create = mat.createAIJ
-        create(size=((self.nrows, None),
-                     (self.ncols, None)),
-               nnz=(self.sparsity.nnz, self.sparsity.onnz),
-               bsize=(rdim, cdim),
-               comm=self.comm)
+        size = ((self.nrows, None), (self.ncols, None))
+        create(size, bsize=(rdim, cdim), comm=self.comm)
+
         mat.setLGMap(rmap=row_lg, cmap=col_lg)
+        mat.setPreallocationNNZ((self.sparsity.nnz, self.sparsity.onnz))
         # Stash entries destined for other processors
         mat.setOption(mat.Option.IGNORE_OFF_PROC_ENTRIES, False)
         # Any add or insertion that would generate a new entry that has not
@@ -716,7 +786,8 @@ def _init_block(self):
         mat.setOption(mat.Option.KEEP_NONZERO_PATTERN, True)
         # We completely fill the allocated matrix when zeroing the
         # entries, so raise an error if we "missed" one.
-        mat.setOption(mat.Option.UNUSED_NONZERO_LOCATION_ERR, True)
+        if self.mat_type != "is":
+            mat.setOption(mat.Option.UNUSED_NONZERO_LOCATION_ERR, True)
         # Put zeros in all the places we might eventually put a value.
         with profiling.timed_region("MatZeroInitial"):
             sparsity.fill_with_zeros(mat, self.sparsity.dims[0][0],
@@ -854,11 +925,15 @@ def set_local_diagonal_entries(self,
             else:
                 rows = np.dstack([rbs*rows + i for i in range(rbs)]).flatten()
         rows = rows.reshape(-1, 1)
-        self.change_assembly_state(Mat.INSERT_VALUES)
-        if len(rows) > 0:
-            values = np.full(rows.shape, diag_val, dtype=dtypes.ScalarType)
-            self.handle.setValuesLocalRCV(rows, rows, values,
-                                          addv=PETSc.InsertMode.INSERT_VALUES)
+        if self.handle.type == "is":
+            self.handle.assemble()
+            self.handle.zeroRowsColumnsLocal(rows, diag_val)
+        else:
+            self.change_assembly_state(Mat.INSERT_VALUES)
+            if len(rows) > 0:
+                values = np.full(rows.shape, diag_val, dtype=dtypes.ScalarType)
+                self.handle.setValuesLocalRCV(rows, rows, values,
+                                              addv=PETSc.InsertMode.INSERT_VALUES)
 
     @mpi.collective
     def assemble(self):
@@ -982,11 +1057,15 @@ def set_local_diagonal_entries(self, rows, diag_val=1.0, idx=None):
             else:
                 rows = np.dstack([rbs*rows + i for i in range(rbs)]).flatten()
         rows = rows.reshape(-1, 1)
-        self.change_assembly_state(Mat.INSERT_VALUES)
-        if len(rows) > 0:
-            values = np.full(rows.shape, diag_val, dtype=dtypes.ScalarType)
-            self.handle.setValuesLocalRCV(rows, rows, values,
-                                          addv=PETSc.InsertMode.INSERT_VALUES)
+        if self.handle.type == "is":
+            self.handle.assemble()
+            self.handle.zeroRowsColumnsLocal(rows, diag_val)
+        else:
+            self.change_assembly_state(Mat.INSERT_VALUES)
+            if len(rows) > 0:
+                values = np.full(rows.shape, diag_val, dtype=dtypes.ScalarType)
+                self.handle.setValuesLocalRCV(rows, rows, values,
+                                              addv=PETSc.InsertMode.INSERT_VALUES)
 
     def addto_values(self, rows, cols, values):
         """Add a block of values to the :class:`Mat`."""
diff --git a/tests/firedrake/regression/test_assemble.py b/tests/firedrake/regression/test_assemble.py
index 94e3439c6e..9972715746 100644
--- a/tests/firedrake/regression/test_assemble.py
+++ b/tests/firedrake/regression/test_assemble.py
@@ -115,6 +115,76 @@ def test_mat_nest_real_block_assembler_correctly_reuses_tensor(mesh):
     assert A2.M is A1.M
 
 
+@pytest.mark.parallel
+@pytest.mark.parametrize("shape,mat_type", [("scalar", "is"), ("vector", "is"), ("mixed", "is"), ("mixed", "nest")])
+@pytest.mark.parametrize("dirichlet_bcs", [False, True])
+def test_assemble_matis(mesh, shape, mat_type, dirichlet_bcs):
+    if shape == "scalar":
+        V = FunctionSpace(mesh, "CG", 1)
+    elif shape == "vector":
+        V = VectorFunctionSpace(mesh, "CG", 1, dim=3)
+    elif shape == "mixed":
+        V = VectorFunctionSpace(mesh, "CG", 1)
+        Q = FunctionSpace(mesh, "CG", 1)
+        V = V * Q
+    else:
+        raise ValueError(f"Unrecognized shape {shape}.")
+
+    if V.value_size == 1:
+        A = 1
+    else:
+        A = as_matrix([[2, -1, 0], [-1, 2, -1], [0, -1, 2]])
+
+    u = TrialFunction(V)
+    v = TestFunction(V)
+    a = inner(A * grad(u), grad(v))*dx
+    if dirichlet_bcs:
+        subspaces = [V] if len(V) == 1 else [V.sub(i) for i in range(len(V))]
+        components = []
+        for i, Vi in enumerate(subspaces):
+            if Vi.block_size == 1:
+                components.append(Vi)
+            else:
+                components.extend(Vi.sub(j) for j in range(Vi.block_size))
+
+        assert len(components) == V.value_size
+        bcs = [DirichletBC(components[i], 0, (i % 4+1, (i+2) % 4+1)) for i in range(len(components))]
+    else:
+        bcs = None
+
+    aij_ref = assemble(a, bcs=bcs, mat_type="aij").petscmat
+    ais = assemble(a, bcs=bcs, mat_type=mat_type, sub_mat_type="is").petscmat
+
+    aij = PETSc.Mat()
+    if ais.type == "nest":
+        blocks = []
+        for i in range(len(V)):
+            row = []
+            for j in range(len(V)):
+                bis = ais.getNestSubMatrix(i, j)
+                if i == j:
+                    assert bis.type == "is"
+                    bij = PETSc.Mat()
+                    bis.convert("aij", bij)
+                else:
+                    bij = bis
+                row.append(bij)
+            blocks.append(row)
+        anest = PETSc.Mat()
+        anest.createNest(blocks,
+                         isrows=V.dof_dset.field_ises,
+                         iscols=V.dof_dset.field_ises,
+                         comm=ais.comm)
+        anest.convert("aij", aij)
+    else:
+        assert ais.type == "is"
+        ais.convert("aij", aij)
+
+    aij_ref.axpy(-1, aij)
+    ind, iptr, values = aij_ref.getValuesCSR()
+    assert np.allclose(values, 0)
+
+
 def test_assemble_diagonal(mesh):
     V = FunctionSpace(mesh, "P", 3)
     u = TrialFunction(V)
diff --git a/tests/firedrake/regression/test_bddc.py b/tests/firedrake/regression/test_bddc.py
index 57f5d42d52..c111c07181 100644
--- a/tests/firedrake/regression/test_bddc.py
+++ b/tests/firedrake/regression/test_bddc.py
@@ -3,28 +3,35 @@
 from firedrake.petsc import DEFAULT_DIRECT_SOLVER
 
 
-def bddc_params(static_condensation):
+@pytest.fixture
+def rg():
+    return RandomGenerator(PCG64(seed=123456789))
+
+
+def bddc_params():
     chol = {
         "pc_type": "cholesky",
-        "pc_factor_mat_solver_type": "petsc",
-        "pc_factor_mat_ordering_type": "natural",
+        "pc_factor_mat_solver_type": DEFAULT_DIRECT_SOLVER,
     }
     sp = {
+        "mat_type": "is",
         "pc_type": "python",
         "pc_python_type": "firedrake.BDDCPC",
         "bddc_pc_bddc_neumann": chol,
         "bddc_pc_bddc_dirichlet": chol,
-        "bddc_pc_bddc_coarse": DEFAULT_DIRECT_SOLVER,
+        "bddc_pc_bddc_coarse": chol,
     }
     return sp
 
 
-def solver_parameters(static_condensation=True):
-    rtol = 1E-8
-    atol = 1E-12
-    sp_bddc = bddc_params(static_condensation)
-    repeated = True
-    if static_condensation:
+def solver_parameters(cellwise=False, condense=False, variant=None, rtol=1E-10, atol=0):
+    sp_bddc = bddc_params()
+    if not cellwise:
+        assert not condense
+        sp = sp_bddc
+
+    elif condense:
+        assert variant == "fdm"
         sp = {
             "pc_type": "python",
             "pc_python_type": "firedrake.FacetSplitPC",
@@ -33,36 +40,42 @@ def solver_parameters(static_condensation=True):
             "facet_fdm_static_condensation": True,
             "facet_fdm_pc_use_amat": False,
             "facet_fdm_mat_type": "is",
-            "facet_fdm_mat_is_allow_repeated": repeated,
+            "facet_fdm_mat_is_allow_repeated": cellwise,
             "facet_fdm_pc_type": "fieldsplit",
             "facet_fdm_pc_fieldsplit_type": "symmetric_multiplicative",
             "facet_fdm_pc_fieldsplit_diag_use_amat": False,
             "facet_fdm_pc_fieldsplit_off_diag_use_amat": False,
             "facet_fdm_fieldsplit_ksp_type": "preonly",
-            "facet_fdm_fieldsplit_0_pc_type": "jacobi",
+            "facet_fdm_fieldsplit_0_pc_type": "bjacobi",
+            "facet_fdm_fieldsplit_0_pc_type_sub_pc_type": "icc",
             "facet_fdm_fieldsplit_1": sp_bddc,
         }
     else:
+        assert variant == "fdm"
         sp = {
             "pc_type": "python",
             "pc_python_type": "firedrake.FDMPC",
             "fdm_pc_use_amat": False,
-            "fdm_mat_type": "is",
-            "fdm_mat_is_allow_repeated": repeated,
+            "fdm_mat_is_allow_repeated": cellwise,
             "fdm": sp_bddc,
         }
+
     sp.update({
-        "mat_type": "matfree",
         "ksp_type": "cg",
+        "ksp_max_it": 20,
         "ksp_norm_type": "natural",
-        "ksp_monitor": None,
+        "ksp_converged_reason": None,
         "ksp_rtol": rtol,
         "ksp_atol": atol,
     })
+    if variant == "fdm":
+        sp["mat_type"] = "matfree"
     return sp
 
 
-def solve_riesz_map(mesh, family, degree, bcs, condense):
+def solve_riesz_map(rg, mesh, family, degree, variant, bcs, cellwise=False, condense=False, vector=False):
+    """Solve the riesz map for a random manufactured solution and return the
+       square root of the estimated condition number."""
     dirichlet_ids = []
     if bcs:
         dirichlet_ids = ["on_boundary"]
@@ -74,7 +87,10 @@ def solve_riesz_map(mesh, family, degree, bcs, condense):
         family = "RTCE" if tdim == 2 else "NCE"
     if family.endswith("F"):
         family = "RTCF" if tdim == 2 else "NCF"
-    V = FunctionSpace(mesh, family, degree, variant="fdm")
+
+    fs = VectorFunctionSpace if vector else FunctionSpace
+
+    V = fs(mesh, family, degree, variant=variant)
     v = TestFunction(V)
     u = TrialFunction(V)
     d = {
@@ -85,46 +101,97 @@ def solve_riesz_map(mesh, family, degree, bcs, condense):
 
     formdegree = V.finat_element.formdegree
     if formdegree == 0:
-        a = inner(d(u), d(v)) * dx(degree=2*degree)
+        a = inner(d(u), d(v)) * dx
     else:
-        a = (inner(u, v) + inner(d(u), d(v))) * dx(degree=2*degree)
+        a = (inner(u, v) + inner(d(u), d(v))) * dx
 
-    rg = RandomGenerator(PCG64(seed=123456789))
     u_exact = rg.uniform(V, -1, 1)
     L = ufl.replace(a, {u: u_exact})
     bcs = [DirichletBC(V, u_exact, sub) for sub in dirichlet_ids]
     nsp = None
     if formdegree == 0:
-        nsp = VectorSpaceBasis([Function(V).interpolate(Constant(1))])
+        b = np.zeros(V.value_shape)
+        expr = Constant(b)
+        basis = []
+        for i in np.ndindex(V.value_shape):
+            b[...] = 0
+            b[i] = 1
+            expr.assign(b)
+            basis.append(Function(V).interpolate(expr))
+        nsp = VectorSpaceBasis(basis)
         nsp.orthonormalize()
 
     uh = Function(V, name="solution")
     problem = LinearVariationalProblem(a, L, uh, bcs=bcs)
 
-    sp = solver_parameters(condense)
+    rtol = 1E-8
+    sp = solver_parameters(cellwise=cellwise, condense=condense, variant=variant, rtol=rtol)
+    sp.setdefault("ksp_view_singularvalues", None)
     solver = LinearVariationalSolver(problem, near_nullspace=nsp,
-                                     solver_parameters=sp,
-                                     options_prefix="")
+                                     solver_parameters=sp)
     solver.solve()
-    return solver.snes.getLinearSolveIterations()
+    uerr = Function(V).assign(uh - u_exact)
+    assert (assemble(a(uerr, uerr)) / assemble(a(u_exact, u_exact))) ** 0.5 < rtol
+
+    ew = solver.snes.ksp.computeEigenvalues()
+    assert min(ew) >= 1.0
+    kappa = max(abs(ew)) / min(abs(ew))
+    return kappa ** 0.5
 
 
 @pytest.fixture(params=(2, 3), ids=("square", "cube"))
-def mesh(request):
+def mh(request):
     dim = request.param
     nx = 4
-    msh = UnitSquareMesh(nx, nx, quadrilateral=True)
+    base = UnitSquareMesh(nx, nx, quadrilateral=True)
+    mh = MeshHierarchy(base, 1)
     if dim == 3:
-        msh = ExtrudedMesh(msh, nx)
-    return msh
+        mh = ExtrudedMeshHierarchy(mh, height=1, base_layer=nx)
+    return mh
 
 
 @pytest.mark.parallel
-@pytest.mark.parametrize("family", "Q")
-@pytest.mark.parametrize("degree", (4,))
+@pytest.mark.parametrize("degree", range(1, 3))
+@pytest.mark.parametrize("variant", ("spectral", "fdm"))
+def test_vertex_dofs(mh, variant, degree):
+    """Check that we extract the right number of vertex dofs from a high order Lagrange space."""
+    from firedrake.preconditioners.bddc import get_restricted_dofs
+    mesh = mh[-1]
+    P1 = FunctionSpace(mesh, "Lagrange", 1, variant=variant)
+    V0 = FunctionSpace(mesh, "Lagrange", degree, variant=variant)
+    v = get_restricted_dofs(V0, "vertex")
+    assert v.getSizes() == P1.dof_dset.layout_vec.getSizes()
+
+
+@pytest.mark.parallel([1, 3])
+@pytest.mark.parametrize("family,degree", [("Q", 4), ("E", 3), ("F", 3)])
 @pytest.mark.parametrize("condense", (False, True))
-def test_bddc_fdm(mesh, family, degree, condense):
+def test_bddc_cellwise_fdm(rg, mh, family, degree, condense):
+    """Test h-independence of condition number by measuring iteration counts"""
+    variant = "fdm"
     bcs = True
-    tdim = mesh.topological_dimension
-    expected = 7 if tdim == 2 else 11
-    assert solve_riesz_map(mesh, family, degree, bcs, condense) <= expected
+    sqrt_kappa = [solve_riesz_map(rg, m, family, degree, variant, bcs, cellwise=True, condense=condense) for m in mh]
+    assert (np.diff(sqrt_kappa) <= 0.1).all(), str(sqrt_kappa)
+
+
+@pytest.mark.parallel
+@pytest.mark.parametrize("family,degree", [("Q", 4)])
+@pytest.mark.parametrize("vector", (False, True), ids=("scalar", "vector"))
+def test_bddc_aij_quad(rg, mh, family, degree, vector):
+    """Test h-dependence of condition number by measuring iteration counts"""
+    variant = None
+    bcs = True
+    sqrt_kappa = [solve_riesz_map(rg, m, family, degree, variant, bcs, vector=vector) for m in mh]
+    assert (np.diff(sqrt_kappa) <= 0.5).all(), str(sqrt_kappa)
+
+
+@pytest.mark.parallel
+@pytest.mark.parametrize("family,degree", [("CG", 3), ("N1curl", 3), ("N1div", 3)])
+def test_bddc_aij_simplex(rg, family, degree):
+    """Test h-dependence of condition number by measuring iteration counts"""
+    variant = None
+    bcs = True
+    base = UnitCubeMesh(2, 2, 2)
+    meshes = MeshHierarchy(base, 2)
+    sqrt_kappa = [solve_riesz_map(rg, m, family, degree, variant, bcs) for m in meshes]
+    assert (np.diff(sqrt_kappa) <= 0.5).all(), str(sqrt_kappa)
diff --git a/tests/firedrake/regression/test_fdm.py b/tests/firedrake/regression/test_fdm.py
index 90c7095666..872b5eeb9b 100644
--- a/tests/firedrake/regression/test_fdm.py
+++ b/tests/firedrake/regression/test_fdm.py
@@ -1,4 +1,5 @@
 import pytest
+import numpy
 from firedrake import *
 from pyop2.utils import as_tuple
 from firedrake.petsc import DEFAULT_DIRECT_SOLVER
@@ -334,3 +335,73 @@ def test_ipdg_direct_solver(fs):
             assert uvec.norm() < 1E-8
     else:
         assert norm(u_exact-uh, "H1") < 1.0E-8
+
+
+@pytest.mark.parallel(nprocs=2)
+@pytest.mark.parametrize("mat_type", ("aij",))
+@pytest.mark.parametrize("variant,degree", [("spectral", 1), ("spectral", 4), ("integral", 4), ("fdm", 4)])
+def test_tabulate_gradient(mesh, variant, degree, mat_type):
+    from firedrake.preconditioners.fdm import tabulate_exterior_derivative
+    tdim = mesh.topological_dimension
+    family = {1: "DG", 2: "RTCE", 3: "NCE"}[tdim]
+
+    V0 = FunctionSpace(mesh, "Lagrange", degree, variant=variant)
+    V1 = FunctionSpace(mesh, family, degree-(tdim == 1), variant=variant)
+    D = tabulate_exterior_derivative(V0, V1, mat_type=mat_type)
+
+    M = assemble(inner(TrialFunction(V1), TestFunction(V1))*dx).petscmat
+    Dij = D if D.type.endswith("aij") else D.convert(M.type, PETSc.Mat())
+    B = M.matMult(Dij)
+
+    Bref = assemble(inner(grad(TrialFunction(V0)), TestFunction(V1))*dx).petscmat
+    Bref.axpy(-1, B)
+    _, _, vals = Bref.getValuesCSR()
+    assert numpy.allclose(vals, 0)
+
+
+@pytest.mark.parallel(nprocs=2)
+@pytest.mark.parametrize("mat_type", ("aij",))
+@pytest.mark.parametrize("variant,degree", [("spectral", 1), ("spectral", 4), ("integral", 4), ("fdm", 4)])
+def test_tabulate_curl(mesh, variant, degree, mat_type):
+    from firedrake.preconditioners.fdm import tabulate_exterior_derivative
+    tdim = mesh.topological_dimension
+    family1 = {1: "CG", 2: "CG", 3: "NCE"}[tdim]
+    family2 = {1: "DG", 2: "RTCF", 3: "NCF"}[tdim]
+
+    V1 = FunctionSpace(mesh, family1, degree, variant=variant)
+    V2 = FunctionSpace(mesh, family2, degree-(tdim == 1), variant=variant)
+    D = tabulate_exterior_derivative(V1, V2, mat_type=mat_type)
+
+    M = assemble((-1)**(tdim-1)*inner(TrialFunction(V2), TestFunction(V2))*dx).petscmat
+    Dij = D if D.type.endswith("aij") else D.convert(M.type, PETSc.Mat())
+    B = M.matMult(Dij)
+
+    Bref = assemble(inner(curl(TrialFunction(V1)), TestFunction(V2))*dx).petscmat
+    Bref.axpy(-1, B)
+    _, _, vals = Bref.getValuesCSR()
+    assert numpy.allclose(vals, 0)
+
+
+@pytest.mark.parallel(nprocs=2)
+@pytest.mark.parametrize("mat_type", ("aij", "is"))
+@pytest.mark.parametrize("variant,degree", [("spectral", 1), ("spectral", 4), ("integral", 4), ("fdm", 4)])
+def test_tabulate_divergence(mesh, variant, degree, mat_type):
+    from firedrake.preconditioners.fdm import tabulate_exterior_derivative
+    tdim = mesh.topological_dimension
+    family = {1: "CG", 2: "RTCF", 3: "NCF"}[tdim]
+
+    V = FunctionSpace(mesh, family, degree, variant=variant)
+    Q = FunctionSpace(mesh, "DG", 0, variant=f"integral({degree-1})")
+    D = tabulate_exterior_derivative(V, Q, mat_type=mat_type, allow_repeated=True)
+
+    # Fix scale
+    Jdet = JacobianDeterminant(mesh)
+    M = assemble(inner(TrialFunction(Q)*(1/Jdet), TestFunction(Q))*dx, diagonal=True)
+    with M.dat.vec as mvec:
+        D.diagonalScale(mvec, None)
+
+    Dref = assemble(inner(div(TrialFunction(V)), TestFunction(Q))*dx).petscmat
+    Dij = D if D.type.endswith("aij") else D.convert(Dref.type, PETSc.Mat())
+    Dref.axpy(-1, Dij)
+    _, _, vals = Dref.getValuesCSR()
+    assert numpy.allclose(vals, 0)