diff --git a/ext/OceananigansEnzymeExt.jl b/ext/OceananigansEnzymeExt.jl
index ce1662d32f..c3b4954de8 100644
--- a/ext/OceananigansEnzymeExt.jl
+++ b/ext/OceananigansEnzymeExt.jl
@@ -228,7 +228,7 @@ function EnzymeCore.EnzymeRules.augmented_primal(config,
                                                  include_right_boundaries = false,
                                                  reduced_dimensions = (),
                                                  location = nothing,
-                                                 only_active_cells = nothing,
+                                                 active_cells_map = nothing,
                                                  kwargs...) where N
 
 
@@ -239,8 +239,8 @@ function EnzymeCore.EnzymeRules.augmented_primal(config,
 
     offset = Oceananigans.Utils.offsets(workspec.val)
 
-    if !isnothing(only_active_cells) 
-        workgroup, worksize = Oceananigans.Utils.active_cells_work_layout(workgroup, worksize, only_active_cells, grid.val) 
+    if !isnothing(active_cells_map) 
+        workgroup, worksize = Oceananigans.Utils.active_cells_work_layout(workgroup, worksize, active_cells_map, grid.val) 
         offset = nothing
     end
 
@@ -286,7 +286,7 @@ function EnzymeCore.EnzymeRules.reverse(config::EnzymeCore.EnzymeRules.ConfigWid
                                                  include_right_boundaries = false,
                                                  reduced_dimensions = (),
                                                  location = nothing,
-                                                 only_active_cells = nothing,
+                                                 active_cells_map = nothing,
                                                  kwargs...) where N
 
   subrets = if tape !== nothing
diff --git a/src/Advection/tracer_advection_operators.jl b/src/Advection/tracer_advection_operators.jl
index b01f2471ff..86cc43229b 100644
--- a/src/Advection/tracer_advection_operators.jl
+++ b/src/Advection/tracer_advection_operators.jl
@@ -1,6 +1,31 @@
 using Oceananigans.Operators: Vᶜᶜᶜ
 using Oceananigans.Fields: ZeroField
 
+struct TracerAdvection{N, FT, A, B, C} <: AbstractAdvectionScheme{N, FT}
+    x :: A
+    y :: B
+    z :: C
+
+    TracerAdvection{N, FT}(x::A, y::B, z::C) where {N, FT, A, B, C} = new{N, FT, A, B, C}(x, y, z)
+end
+
+"""
+    function TracerAdvection(x, y, z)
+
+Builds a `TracerAdvection` type with reconstructions schemes `x`, `y`, and `z` to be applied in
+the x, y, and z direction, respectively.
+"""
+function TracerAdvection(x_advection, y_advection, z_advection)
+    Hx = required_halo_size(x_advection)
+    Hy = required_halo_size(y_advection)
+    Hz = required_halo_size(z_advection)
+
+    FT = eltype(x_advection)
+    H = max(Hx, Hy, Hz)
+
+    return TracerAdvection{H, FT}(x_advection, y_advection, z_advection)
+end
+
 @inline _advective_tracer_flux_x(args...) = advective_tracer_flux_x(args...)
 @inline _advective_tracer_flux_y(args...) = advective_tracer_flux_y(args...)
 @inline _advective_tracer_flux_z(args...) = advective_tracer_flux_z(args...)
@@ -32,3 +57,9 @@ which ends up at the location `ccc`.
                                     δyᵃᶜᵃ(i, j, k, grid, _advective_tracer_flux_y, advection, U.v, c) +
                                     δzᵃᵃᶜ(i, j, k, grid, _advective_tracer_flux_z, advection, U.w, c))
 end
+
+@inline function div_Uc(i, j, k, grid, advection::TracerAdvection, U, c)
+    return 1/Vᶜᶜᶜ(i, j, k, grid) * (δxᶜᵃᵃ(i, j, k, grid, _advective_tracer_flux_x, advection.x, U.u, c) +
+                                    δyᵃᶜᵃ(i, j, k, grid, _advective_tracer_flux_y, advection.y, U.v, c) +
+                                    δzᵃᵃᶜ(i, j, k, grid, _advective_tracer_flux_z, advection.z, U.w, c))
+end
diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
index 66b543dada..7530e1788c 100644
--- a/src/Advection/vector_invariant_advection.jl
+++ b/src/Advection/vector_invariant_advection.jl
@@ -111,9 +111,9 @@ Vector Invariant, Dimension-by-dimension reconstruction
 function VectorInvariant(; vorticity_scheme = EnstrophyConserving(),
                            vorticity_stencil = VelocityStencil(),
                            vertical_scheme = EnergyConserving(),
-                           kinetic_energy_gradient_scheme = vertical_scheme,
                            divergence_scheme = vertical_scheme,
-                           upwinding  = OnlySelfUpwinding(; cross_scheme = vertical_scheme),
+                           kinetic_energy_gradient_scheme = divergence_scheme,
+                           upwinding  = OnlySelfUpwinding(; cross_scheme = divergence_scheme),
                            multi_dimensional_stencil = false)
 
     N = required_halo_size(vorticity_scheme)
@@ -144,10 +144,10 @@ const VectorInvariantKEGradientEnergyConserving = VectorInvariant{<:Any, <:Any,
 const VectorInvariantKineticEnergyUpwinding     = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme}
 
 
-#                                                 VectorInvariant{N,     FT,    M,     Z,     ZS,    V,                                     K,     D,     U (upwinding)
-const VectorInvariantCrossVerticalUpwinding     = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:Any, <:Any, <:CrossAndSelfUpwinding}
-const VectorInvariantSelfVerticalUpwinding      = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:Any, <:Any, <:OnlySelfUpwinding}
-const VectorInvariantVelocityVerticalUpwinding  = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:Any, <:Any, <:VelocityUpwinding}
+#                                                 VectorInvariant{N,     FT,    M,     Z,     ZS,     V,     K,     D,                                     U (upwinding)
+const VectorInvariantCrossVerticalUpwinding     = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any,  <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:CrossAndSelfUpwinding}
+const VectorInvariantSelfVerticalUpwinding      = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any,  <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:OnlySelfUpwinding}
+const VectorInvariantVelocityVerticalUpwinding  = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any,  <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:VelocityUpwinding}
 
 Base.summary(a::VectorInvariant)                 = string("Vector Invariant, Dimension-by-dimension reconstruction")
 Base.summary(a::MultiDimensionalVectorInvariant) = string("Vector Invariant, Multidimensional reconstruction")
@@ -166,10 +166,7 @@ Base.show(io::IO, a::VectorInvariant{N, FT}) where {N, FT} =
 ##### Convenience for WENO Vector Invariant
 #####
 
-#                           VectorInvariant{N,     FT,    M,     Z (vorticity scheme), ZS,    V (vertical scheme),      K (kinetic energy gradient scheme)
-const WENOVectorInvariant = VectorInvariant{<:Any, <:Any, <:Any, <:WENO, <:Any, <:WENO, <:WENO}
-
-nothing_to_default(user_value, default) = isnothing(user_value) ? default : user_value
+nothing_to_default(user_value; default) = isnothing(user_value) ? default : user_value
 
 """
     WENOVectorInvariant(; upwinding = nothing,
@@ -188,23 +185,23 @@ function WENOVectorInvariant(; upwinding = nothing,
                                weno_kw...)
 
     if isnothing(order) # apply global defaults
-        vorticity_order               = nothing_to_default(vorticity_order, default=9)
-        vertical_order                = nothing_to_default(vertical_order, default=5)
-        divergence_order              = nothing_to_default(divergence_order, default=5)
-        kinetic_energy_gradient_order = nothing_to_default(kinetic_energy_gradient_order, default=5)
+        vorticity_order               = nothing_to_default(vorticity_order,  default = 9)
+        vertical_order                = nothing_to_default(vertical_order,   default = 5)
+        divergence_order              = nothing_to_default(divergence_order, default = 5)
+        kinetic_energy_gradient_order = nothing_to_default(kinetic_energy_gradient_order, default = 5)
     else # apply user supplied `order` unless overridden by more specific value
-        vorticity_order               = nothing_to_default(vorticity_order, default=order)
-        vertical_order                = nothing_to_default(vertical_order, default=order)
-        divergence_order              = nothing_to_default(divergence_order, default=order)
-        kinetic_energy_gradient_order = nothing_to_default(kinetic_energy_gradient_order, default=order)
+        vorticity_order               = nothing_to_default(vorticity_order,  default = order)
+        vertical_order                = nothing_to_default(vertical_order,   default = order)
+        divergence_order              = nothing_to_default(divergence_order, default = order)
+        kinetic_energy_gradient_order = nothing_to_default(kinetic_energy_gradient_order, default = order)
     end
 
-    vorticity_scheme               = WENO(; order=vorticity_order, weno_kw...)
-    vertical_scheme                = WENO(; order=vertical_order, weno_kw...)
-    kinetic_energy_gradient_scheme = WENO(; order=kinetic_energy_gradient_order, weno_kw...)
-    divergence_scheme              = WENO(; order=divergence_order, weno_kw...)
+    vorticity_scheme               = WENO(; order = vorticity_order, weno_kw...)
+    vertical_scheme                = WENO(; order = vertical_order, weno_kw...)
+    kinetic_energy_gradient_scheme = WENO(; order = kinetic_energy_gradient_order, weno_kw...)
+    divergence_scheme              = WENO(; order = divergence_order, weno_kw...)
 
-    default_upwinding = OnlySelfUpwinding(cross_scheme=divergence_scheme)
+    default_upwinding = OnlySelfUpwinding(cross_scheme = divergence_scheme)
     upwinding = nothing_to_default(upwinding; default = default_upwinding)
 
     schemes = (vorticity_scheme, vertical_scheme, kinetic_energy_gradient_scheme, divergence_scheme)
diff --git a/src/Architectures.jl b/src/Architectures.jl
index 6dbc3cf2b1..78b55a067a 100644
--- a/src/Architectures.jl
+++ b/src/Architectures.jl
@@ -115,4 +115,10 @@ end
 @inline unsafe_free!(a::CuArray) = CUDA.unsafe_free!(a)
 @inline unsafe_free!(a)          = nothing
 
+# Convert arguments to GPU-compatible types
+@inline convert_args(::CPU, args) = args
+@inline convert_args(::GPU, args) = CUDA.cudaconvert(args)
+@inline convert_args(::GPU, args::Tuple) = map(CUDA.cudaconvert, args)
+
 end # module
+
diff --git a/src/DistributedComputations/distributed_architectures.jl b/src/DistributedComputations/distributed_architectures.jl
index 055bbb93b1..2aa2a53e45 100644
--- a/src/DistributedComputations/distributed_architectures.jl
+++ b/src/DistributedComputations/distributed_architectures.jl
@@ -2,7 +2,7 @@ using Oceananigans.Architectures
 using Oceananigans.Grids: topology, validate_tupled_argument
 using CUDA: ndevices, device!
 
-import Oceananigans.Architectures: device, cpu_architecture, arch_array, array_type, child_architecture
+import Oceananigans.Architectures: device, cpu_architecture, arch_array, array_type, child_architecture, convert_args
 import Oceananigans.Grids: zeros
 import Oceananigans.Utils: sync_device!, tupleit
 
@@ -210,7 +210,7 @@ function Distributed(child_architecture = CPU();
                      partition = Partition(MPI.Comm_size(communicator)))
 
     if !(MPI.Initialized())
-        @info "MPI has not been initialized, so we are calling MPI.Init()".
+        @info "MPI has not been initialized, so we are calling MPI.Init()."
         MPI.Init()
     end
 
@@ -265,6 +265,7 @@ arch_array(arch::Distributed, A)      = arch_array(child_architecture(arch), A)
 zeros(FT, arch::Distributed, N...)    = zeros(FT, child_architecture(arch), N...)
 array_type(arch::Distributed)         = array_type(child_architecture(arch))
 sync_device!(arch::Distributed)       = sync_device!(arch.child_architecture)
+convert_args(arch::Distributed, arg)  = convert_args(child_architecture(arch), arg)
 
 cpu_architecture(arch::DistributedCPU) = arch
 cpu_architecture(arch::Distributed{A, S}) where {A, S} = 
diff --git a/src/DistributedComputations/distributed_fields.jl b/src/DistributedComputations/distributed_fields.jl
index ebffdf4f76..878646662e 100644
--- a/src/DistributedComputations/distributed_fields.jl
+++ b/src/DistributedComputations/distributed_fields.jl
@@ -39,16 +39,19 @@ end
 function set!(u::DistributedField, v::Union{Array, CuArray})
     gsize = global_size(architecture(u), size(u))
 
-    if size(v) == size(u)
-        f = arch_array(architecture(u), v)
-        u .= f
-        return u
-    elseif size(v) == gsize
+    if size(v) == gsize
         f = partition_global_array(architecture(u), v, size(u))
         u .= f
         return u
     else
-        throw(ArgumentError("ERROR: DimensionMismatch: array could not be set to match destination field"))
+        try
+            f = arch_array(architecture(u), v)
+            u .= f
+            return u
+    
+        catch
+            throw(ArgumentError("ERROR: DimensionMismatch: array could not be set to match destination field"))
+        end
     end
 end
 
diff --git a/src/DistributedComputations/distributed_grids.jl b/src/DistributedComputations/distributed_grids.jl
index 6cd12eeb95..90a3d4b80a 100644
--- a/src/DistributedComputations/distributed_grids.jl
+++ b/src/DistributedComputations/distributed_grids.jl
@@ -41,8 +41,9 @@ end
 @inline local_sizes(N, R::Fractional) = Tuple(ceil(Int, N * r) for r in R.sizes)
 @inline function local_sizes(N, R::Sizes)
     if N != sum(R.sizes)
-        @warn "The domain size specified in the architecture $(R.sizes) is inconsistent 
-               with the grid size $N: using the architecture-specified size"
+        @warn "The Sizes specified in the architecture $(R.sizes) is inconsistent  
+               with the grid size: (N = $N != sum(Sizes) = $(sum(R.sizes))). 
+               Using $(R.sizes)..."
     end
     return R.sizes
 end
@@ -80,9 +81,9 @@ function RectilinearGrid(arch::Distributed,
     TY = insert_connected_topology(topology[2], Ry, rj)
     TZ = insert_connected_topology(topology[3], Rz, rk)
     
-    xl = partition(x, nx, arch, 1)
-    yl = partition(y, ny, arch, 2)
-    zl = partition(z, nz, arch, 3)
+    xl = Rx == 1 ? x : partition_coordinate(x, nx, arch, 1)
+    yl = Ry == 1 ? y : partition_coordinate(y, ny, arch, 2)
+    zl = Rz == 1 ? z : partition_coordinate(z, nz, arch, 3)
 
     Lx, xᶠᵃᵃ, xᶜᵃᵃ, Δxᶠᵃᵃ, Δxᶜᵃᵃ = generate_coordinate(FT, topology[1](), nx, Hx, xl, :x, child_architecture(arch))
     Ly, yᵃᶠᵃ, yᵃᶜᵃ, Δyᵃᶠᵃ, Δyᵃᶜᵃ = generate_coordinate(FT, topology[2](), ny, Hy, yl, :y, child_architecture(arch))
@@ -127,9 +128,9 @@ function LatitudeLongitudeGrid(arch::Distributed,
     TY = insert_connected_topology(topology[2], Ry, rj)
     TZ = insert_connected_topology(topology[3], Rz, rk)
 
-    λl = partition(longitude, nλ, arch, 1)
-    φl = partition(latitude,  nφ, arch, 2)
-    zl = partition(z,         nz, arch, 3)
+    λl = Rx == 1 ? longitude : partition_coordinate(longitude, nλ, arch, 1)
+    φl = Ry == 1 ? latitude  : partition_coordinate(latitude,  nφ, arch, 2)
+    zl = Rz == 1 ? z         : partition_coordinate(z,         nz, arch, 3)
 
     # Calculate all direction (which might be stretched)
     # A direction is regular if the domain passed is a Tuple{<:Real, <:Real}, 
@@ -186,9 +187,9 @@ function reconstruct_global_grid(grid::DistributedRectilinearGrid)
     z = cpu_face_constructor_z(grid)
 
     ## This will not work with 3D parallelizations!!
-    xG = Rx == 1 ? x : assemble(x, nx, Rx, ri, rj, rk, arch.communicator)
-    yG = Ry == 1 ? y : assemble(y, ny, Ry, rj, ri, rk, arch.communicator)
-    zG = Rz == 1 ? z : assemble(z, nz, Rz, rk, ri, rj, arch.communicator)
+    xG = Rx == 1 ? x : assemble_coordinate(x, nx, Rx, ri, rj, rk, arch.communicator)
+    yG = Ry == 1 ? y : assemble_coordinate(y, ny, Ry, rj, ri, rk, arch.communicator)
+    zG = Rz == 1 ? z : assemble_coordinate(z, nz, Rz, rk, ri, rj, arch.communicator)
 
     child_arch = child_architecture(arch)
 
@@ -229,9 +230,9 @@ function reconstruct_global_grid(grid::DistributedLatitudeLongitudeGrid)
     z = cpu_face_constructor_z(grid)
 
     ## This will not work with 3D parallelizations!!
-    λG = Rx == 1 ? λ : assemble(λ, nλ, Rx, ri, rj, rk, arch.communicator)
-    φG = Ry == 1 ? φ : assemble(φ, nφ, Ry, rj, ri, rk, arch.communicator)
-    zG = Rz == 1 ? z : assemble(z, nz, Rz, rk, ri, rj, arch.communicator)
+    λG = Rx == 1 ? λ : assemble_coordinate(λ, nλ, Rx, ri, rj, rk, arch.communicator)
+    φG = Ry == 1 ? φ : assemble_coordinate(φ, nφ, Ry, rj, ri, rk, arch.communicator)
+    zG = Rz == 1 ? z : assemble_coordinate(z, nz, Rz, rk, ri, rj, arch.communicator)
 
     child_arch = child_architecture(arch)
 
@@ -340,4 +341,4 @@ function reconstruct_global_topology(T, R, r, r1, r2, comm)
     else
         return Bounded
     end
-end
+end
\ No newline at end of file
diff --git a/src/DistributedComputations/halo_communication.jl b/src/DistributedComputations/halo_communication.jl
index 87865c75ab..06264dd39c 100644
--- a/src/DistributedComputations/halo_communication.jl
+++ b/src/DistributedComputations/halo_communication.jl
@@ -142,11 +142,10 @@ end
 # corner passing routine
 function fill_corners!(c, connectivity, indices, loc, arch, grid, buffers, args...; async = false, only_local_halos = false, kwargs...)
     
-    if only_local_halos # No corner filling needed!
-        return nothing
-    end
+    # No corner filling needed!
+    only_local_halos && return nothing
 
-    # This has to be synchronized!!
+    # This has to be synchronized!
     fill_send_buffers!(c, buffers, grid, Val(:corners))
     sync_device!(arch)
 
@@ -189,7 +188,6 @@ function fill_halo_event!(c, fill_halos!, bcs, indices, loc, arch, grid::Distrib
 
     if !only_local_halos # Then we need to fill the `send` buffers
         fill_send_buffers!(c, buffers, grid, Val(buffer_side))
-        sync_device!(arch)
     end
 
     # Calculate size and offset of the fill_halo kernel
@@ -201,7 +199,7 @@ function fill_halo_event!(c, fill_halos!, bcs, indices, loc, arch, grid::Distrib
     requests = fill_halos!(c, bcs..., size, offset, loc, arch, grid, buffers, args...; only_local_halos, kwargs...)
 
     pool_requests_or_complete_comm!(c, arch, grid, buffers, requests, async, buffer_side)
-
+    
     return nothing
 end
 
@@ -245,6 +243,8 @@ for (side, opposite_side) in zip([:west, :south], [:east, :north])
                                   grid::DistributedGrid, buffers, args...; only_local_halos = false, kwargs...)
 
             only_local_halos && return nothing
+
+            sync_device!(arch)
                         
             @assert bc_side.condition.from == bc_opposite_side.condition.from  # Extra protection in case of bugs
             local_rank = bc_side.condition.from
@@ -275,11 +275,12 @@ for side in [:west, :east, :south, :north]
 
             only_local_halos && return nothing
 
+            sync_device!(arch)
+
             child_arch = child_architecture(arch)
             local_rank = bc_side.condition.from
 
             recv_req = $recv_and_fill_side_halo!(c, grid, arch, loc, local_rank, bc_side.condition.to, buffers)
-
             send_req = $send_side_halo(c, grid, arch, loc, local_rank, bc_side.condition.to, buffers)
 
             return [send_req, recv_req]
diff --git a/src/DistributedComputations/partition_assemble.jl b/src/DistributedComputations/partition_assemble.jl
index 1010c0f6bf..c8b90e3cbb 100644
--- a/src/DistributedComputations/partition_assemble.jl
+++ b/src/DistributedComputations/partition_assemble.jl
@@ -37,14 +37,19 @@ end
 
 # Partitioning (localization of global objects) and assembly (global assembly of local objects)
 # Used for grid constructors (cpu_face_constructor_x, cpu_face_constructor_y, cpu_face_constructor_z)
-# which means that we need to repeat the value at the right boundary
-function partition(c::AbstractVector, n, arch, idx)
+# We need to repeat the value at the right boundary
+function partition_coordinate(c::AbstractVector, n, arch, idx)
     nl = concatenate_local_sizes(n, arch, idx)
     r  = arch.local_index[idx]
-    return c[1 + sum(nl[1:r-1]) : sum(nl[1:r])]
+    # Allow for Face values
+    if r == arch.ranks[idx]
+        return c[1 + sum(nl[1:r-1]) : end]
+    else
+        return c[1 + sum(nl[1:r-1]) : sum(nl[1:r])]
+    end
 end
 
-function partition(c::Tuple, n, arch, idx)
+function partition_coordinate(c::Tuple, n, arch, idx)
     nl = concatenate_local_sizes(n, arch, idx)
     N  = sum(nl)
     R  = arch.ranks[idx]
@@ -60,14 +65,14 @@ function partition(c::Tuple, n, arch, idx)
 end
 
 """
-    assemble(c::AbstractVector, n, R, r, r1, r2, comm) 
+    assemble_coordinate(c::AbstractVector, n, R, r, r1, r2, comm) 
 
 Builds a linear global coordinate vector given a local coordinate vector `c_local`
 a local number of elements `Nc`, number of ranks `Nr`, rank `r`,
 and `arch`itecture. Since we use a global reduction, only ranks at positions
 1 in the other two directions `r1 == 1` and `r2 == 1` fill the 1D array.
 """
-function assemble(c_local::AbstractVector, n, R, r, r1, r2, comm) 
+function assemble_coordinate(c_local::AbstractVector, n, R, r, r1, r2, comm) 
     nl = concatenate_local_sizes(n, R, r)
 
     c_global = zeros(eltype(c_local), sum(nl)+1)
@@ -83,7 +88,7 @@ function assemble(c_local::AbstractVector, n, R, r, r1, r2, comm)
 end
 
 # Simple case, just take the first and the last core
-function assemble(c::Tuple, n, R, r, r1, r2, comm) 
+function assemble_coordinate(c::Tuple, n, R, r, r1, r2, comm) 
     c_global = zeros(Float64, 2)
 
     if r == 1 && r1 == 1 && r2 == 1
diff --git a/src/Grids/grid_generation.jl b/src/Grids/grid_generation.jl
index b1480aa1f3..98bd421d43 100644
--- a/src/Grids/grid_generation.jl
+++ b/src/Grids/grid_generation.jl
@@ -13,6 +13,7 @@ get_face_node(coord::Function, i) = coord(i)
 get_face_node(coord::AbstractVector, i) = CUDA.@allowscalar coord[i]
 
 const AT = AbstractTopology
+
 lower_exterior_Δcoordᶠ(::AT,              Fi, Hcoord) = [Fi[end - Hcoord + i] - Fi[end - Hcoord + i - 1] for i = 1:Hcoord]
 lower_exterior_Δcoordᶠ(::BoundedTopology, Fi, Hcoord) = [Fi[2]  - Fi[1] for _ = 1:Hcoord]
 
diff --git a/src/ImmersedBoundaries/ImmersedBoundaries.jl b/src/ImmersedBoundaries/ImmersedBoundaries.jl
index 49d49d3934..f13bdabb4b 100644
--- a/src/ImmersedBoundaries/ImmersedBoundaries.jl
+++ b/src/ImmersedBoundaries/ImmersedBoundaries.jl
@@ -102,18 +102,19 @@ abstract type AbstractImmersedBoundary end
 ##### ImmersedBoundaryGrid
 #####
 
-struct ImmersedBoundaryGrid{FT, TX, TY, TZ, G, I, M, Arch} <: AbstractGrid{FT, TX, TY, TZ, Arch}
+struct ImmersedBoundaryGrid{FT, TX, TY, TZ, G, I, M, S, Arch} <: AbstractGrid{FT, TX, TY, TZ, Arch}
     architecture :: Arch
     underlying_grid :: G
     immersed_boundary :: I
     interior_active_cells :: M
-    
+    active_z_columns :: S
+
     # Internal interface
-    function ImmersedBoundaryGrid{TX, TY, TZ}(grid::G, ib::I, mi::M) where {TX, TY, TZ, G <: AbstractUnderlyingGrid, I, M}
+    function ImmersedBoundaryGrid{TX, TY, TZ}(grid::G, ib::I, mi::M, ms::S) where {TX, TY, TZ, G <: AbstractUnderlyingGrid, I, M, S}
         FT = eltype(grid)
         arch = architecture(grid)
         Arch = typeof(arch)
-        return new{FT, TX, TY, TZ, G, I, M, Arch}(arch, grid, ib, mi)
+        return new{FT, TX, TY, TZ, G, I, M, S, Arch}(arch, grid, ib, mi, ms)
     end
 
     # Constructor with no active map
@@ -121,7 +122,7 @@ struct ImmersedBoundaryGrid{FT, TX, TY, TZ, G, I, M, Arch} <: AbstractGrid{FT, T
         FT = eltype(grid)
         arch = architecture(grid)
         Arch = typeof(arch)
-        return new{FT, TX, TY, TZ, G, I, Nothing, Arch}(arch, grid, ib, nothing)
+        return new{FT, TX, TY, TZ, G, I, Nothing, Nothing, Arch}(arch, grid, ib, nothing, nothing)
     end
 end
 
@@ -132,7 +133,7 @@ const IBG = ImmersedBoundaryGrid
 @inline get_ibg_property(ibg::IBG, ::Val{:immersed_boundary})      = getfield(ibg, :immersed_boundary)
 @inline get_ibg_property(ibg::IBG, ::Val{:underlying_grid})        = getfield(ibg, :underlying_grid)
 @inline get_ibg_property(ibg::IBG, ::Val{:interior_active_cells})  = getfield(ibg, :interior_active_cells)
-@inline get_ibg_property(ibg::IBG, ::Val{:surface_active_cells})   = getfield(ibg, :surface_active_cells)
+@inline get_ibg_property(ibg::IBG, ::Val{:active_z_columns})   = getfield(ibg, :active_z_columns)
 
 @inline architecture(ibg::IBG) = architecture(ibg.underlying_grid)
 
@@ -141,7 +142,10 @@ const IBG = ImmersedBoundaryGrid
 @inline z_domain(ibg::IBG) = z_domain(ibg.underlying_grid)
 
 Adapt.adapt_structure(to, ibg::IBG{FT, TX, TY, TZ}) where {FT, TX, TY, TZ} =
-    ImmersedBoundaryGrid{TX, TY, TZ}(adapt(to, ibg.underlying_grid), adapt(to, ibg.immersed_boundary), adapt(to, ibg.interior_active_cells))
+    ImmersedBoundaryGrid{TX, TY, TZ}(adapt(to, ibg.underlying_grid), 
+                                     adapt(to, ibg.immersed_boundary), 
+                                     adapt(to, ibg.interior_active_cells), 
+                                     adapt(to, ibg.active_z_columns))
 
 with_halo(halo, ibg::ImmersedBoundaryGrid) =
     ImmersedBoundaryGrid(with_halo(halo, ibg.underlying_grid), ibg.immersed_boundary)
diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index f3ce915df7..dbb4869368 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -1,26 +1,92 @@
 using Oceananigans
+using Oceananigans.Utils
 using Oceananigans.Grids: AbstractGrid
 
 using KernelAbstractions: @kernel, @index
 
-import Oceananigans.Utils: active_cells_work_layout, 
-                           use_only_active_interior_cells
+import Oceananigans.Utils: active_cells_work_layout
 
-const ActiveCellsIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractArray}
+using Oceananigans.Solvers: solve_batched_tridiagonal_system_z!, ZDirection
+using Oceananigans.DistributedComputations: DistributedGrid
 
-struct InteriorMap end
-struct SurfaceMap end
+import Oceananigans.Solvers: solve_batched_tridiagonal_system_kernel!
 
-@inline use_only_active_interior_cells(grid::ActiveCellsIBG) = InteriorMap()
+const DistributedActiveCellsIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:DistributedGrid, <:Any, <:NamedTuple} # Cannot be used to dispatch in kernels!!!
+const ArrayActiveCellsIBG       = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractArray}
+const NamedTupleActiveCellsIBG  = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:NamedTuple}
 
-@inline use_only_active_surface_cells(grid::AbstractGrid)   = nothing
-@inline use_only_active_surface_cells(grid::ActiveCellsIBG) = SurfaceMap()
+"""
+A constant representing an immersed boundary grid, where interior active cells are mapped to linear indices in grid.interior_active_cells
+"""
+const ActiveCellsIBG = Union{DistributedActiveCellsIBG, ArrayActiveCellsIBG, NamedTupleActiveCellsIBG}
 
-@inline active_cells_work_layout(group, size, ::InteriorMap, grid::ActiveCellsIBG) = min(length(grid.interior_active_cells), 256), length(grid.interior_active_cells)
-@inline active_cells_work_layout(group, size, ::SurfaceMap,  grid::ActiveCellsIBG) = min(length(grid.surface_active_cells),  256), length(grid.surface_active_cells)
+"""
+A constant representing an immersed boundary grid, where active columns in the Z-direction are mapped to linear indices in grid.active_z_columns
+"""
+const ActiveZColumnsIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractArray}
 
-@inline active_linear_index_to_interior_tuple(idx, grid::ActiveCellsIBG) = Base.map(Int, grid.interior_active_cells[idx])
-@inline  active_linear_index_to_surface_tuple(idx, grid::ActiveCellsIBG) = Base.map(Int, grid.surface_active_cells[idx])
+struct InteriorMap end
+struct ZColumnMap end
+
+struct WestMap  end
+struct EastMap  end
+struct SouthMap end
+struct NorthMap end
+
+@inline active_surface_map(::AbstractGrid)      = nothing
+@inline active_surface_map(::ActiveZColumnsIBG) = ZColumnMap()
+
+@inline active_interior_map(::Val{:west})  = WestMap()
+@inline active_interior_map(::Val{:east})  = EastMap()
+@inline active_interior_map(::Val{:south}) = SouthMap()
+@inline active_interior_map(::Val{:north}) = NorthMap()
+
+@inline active_interior_map(::AbstractGrid)              = nothing
+@inline active_interior_map(::ActiveCellsIBG)            = InteriorMap()
+@inline active_interior_map(::DistributedActiveCellsIBG) = InteriorMap()
+
+"""
+    active_cells_work_layout(group, size, map_type, grid)
+
+Compute the work layout for active cells based on the given map type and grid.
+
+# Arguments
+- `group`: The previous workgroup.
+- `size`: The previous worksize.
+- `map_type`: The type of map (e.g., `InteriorMap`, `WestMap`, `EastMap`, `SouthMap`, `NorthMap`).
+- `grid`: The grid containing the active cells.
+
+# Returns
+- A tuple `(workgroup, worksize)` representing the work layout for active cells.
+"""
+@inline active_cells_work_layout(group, size, ::InteriorMap, grid::ArrayActiveCellsIBG)      = min(length(grid.interior_active_cells), 256),          length(grid.interior_active_cells)
+@inline active_cells_work_layout(group, size, ::InteriorMap, grid::NamedTupleActiveCellsIBG) = min(length(grid.interior_active_cells.interior), 256), length(grid.interior_active_cells.interior)
+@inline active_cells_work_layout(group, size, ::WestMap,     grid::NamedTupleActiveCellsIBG) = min(length(grid.interior_active_cells.west),     256), length(grid.interior_active_cells.west)
+@inline active_cells_work_layout(group, size, ::EastMap,     grid::NamedTupleActiveCellsIBG) = min(length(grid.interior_active_cells.east),     256), length(grid.interior_active_cells.east)
+@inline active_cells_work_layout(group, size, ::SouthMap,    grid::NamedTupleActiveCellsIBG) = min(length(grid.interior_active_cells.south),    256), length(grid.interior_active_cells.south)
+@inline active_cells_work_layout(group, size, ::NorthMap,    grid::NamedTupleActiveCellsIBG) = min(length(grid.interior_active_cells.north),    256), length(grid.interior_active_cells.north)
+@inline active_cells_work_layout(group, size, ::ZColumnMap,  grid::ActiveZColumnsIBG)        = min(length(grid.active_z_columns),  256),              length(grid.active_z_columns)
+
+"""
+    active_linear_index_to_tuple(idx, map, grid)
+
+Converts a linear index to a tuple of indices based on the given map and grid.
+
+# Arguments
+- `idx`: The linear index to convert.
+- `map`: The map indicating the type of index conversion to perform.
+- `grid`: The grid containing the active cells.
+
+# Returns
+A tuple of indices corresponding to the linear index.
+"""
+@inline active_linear_index_to_tuple(idx, ::InteriorMap, grid::ArrayActiveCellsIBG)      = Base.map(Int, grid.interior_active_cells[idx])
+@inline active_linear_index_to_tuple(idx, ::InteriorMap, grid::NamedTupleActiveCellsIBG) = Base.map(Int, grid.interior_active_cells.interior[idx])
+@inline active_linear_index_to_tuple(idx, ::WestMap,     grid::NamedTupleActiveCellsIBG) = Base.map(Int, grid.interior_active_cells.west[idx])
+@inline active_linear_index_to_tuple(idx, ::EastMap,     grid::NamedTupleActiveCellsIBG) = Base.map(Int, grid.interior_active_cells.east[idx])
+@inline active_linear_index_to_tuple(idx, ::SouthMap,    grid::NamedTupleActiveCellsIBG) = Base.map(Int, grid.interior_active_cells.south[idx])
+@inline active_linear_index_to_tuple(idx, ::NorthMap,    grid::NamedTupleActiveCellsIBG) = Base.map(Int, grid.interior_active_cells.north[idx])
+@inline active_linear_index_to_tuple(idx, ::ZColumnMap,  grid::ActiveZColumnsIBG)         = Base.map(Int, grid.active_z_columns[idx])
 
 function ImmersedBoundaryGrid(grid, ib; active_cells_map::Bool = true) 
 
@@ -29,30 +95,38 @@ function ImmersedBoundaryGrid(grid, ib; active_cells_map::Bool = true)
     
     # Create the cells map on the CPU, then switch it to the GPU
     if active_cells_map 
-        map_interior = active_cells_map_interior(ibg)
-        map_interior = arch_array(architecture(ibg), map_interior)
-        # map_surface = active_cells_map_surface(ibg)
-        # map_surface = arch_array(architecture(ibg), map_surface)
+        interior_map = map_interior_active_cells(ibg)
+        column_map   = map_active_z_columns(ibg)
     else
-        map_interior = nothing
+        interior_map = nothing
+        column_map  = nothing
     end
 
     return ImmersedBoundaryGrid{TX, TY, TZ}(ibg.underlying_grid, 
                                             ibg.immersed_boundary, 
-                                            map_interior)
+                                            interior_map,
+                                            column_map)
 end
 
+with_halo(halo, ibg::ActiveCellsIBG) =
+    ImmersedBoundaryGrid(with_halo(halo, ibg.underlying_grid), ibg.immersed_boundary; active_cells_map = true)
+
 @inline active_cell(i, j, k, ibg) = !immersed_cell(i, j, k, ibg)
 @inline active_column(i, j, k, grid, column) = column[i, j, k] != 0
 
-function compute_interior_active_cells(ibg)
-    is_immersed_operation = KernelFunctionOperation{Center, Center, Center}(active_cell, ibg)
+@kernel function _set_active_indices!(active_cells_field, grid)
+    i, j, k = @index(Global, NTuple)
+    @inbounds active_cells_field[i, j, k] = active_cell(i, j, k, grid)
+end
+
+function compute_interior_active_cells(ibg; parameters = :xyz)
     active_cells_field = Field{Center, Center, Center}(ibg, Bool)
-    set!(active_cells_field, is_immersed_operation)
+    fill!(active_cells_field, false)
+    launch!(architecture(ibg), ibg, parameters, _set_active_indices!, active_cells_field, ibg)
     return active_cells_field
 end
 
-function compute_surface_active_cells(ibg)
+function compute_active_z_columns(ibg)
     one_field = ConditionalOperation{Center, Center, Center}(OneField(Int), identity, ibg, NotImmersed(truefunc), 0)
     column    = sum(one_field, dims = 3)
     is_immersed_column = KernelFunctionOperation{Center, Center, Nothing}(active_column, ibg, column)
@@ -65,8 +139,20 @@ const MAXUInt8  = 2^8  - 1
 const MAXUInt16 = 2^16 - 1
 const MAXUInt32 = 2^32 - 1
 
-function active_cells_map_interior(ibg)
-    active_cells_field = compute_interior_active_cells(ibg)
+"""
+    interior_active_indices(ibg; parameters = :xyz)
+
+Compute the indices of the active interior cells in the given immersed boundary grid.
+
+# Arguments
+- `ibg`: The immersed boundary grid.
+- `parameters`: (optional) The parameters to be used for computing the active cells. Default is `:xyz`.
+
+# Returns
+An array of tuples representing the indices of the active interior cells.
+"""
+function interior_active_indices(ibg; parameters = :xyz)
+    active_cells_field = compute_interior_active_cells(ibg; parameters)
     
     N = maximum(size(ibg))
     IntType = N > MAXUInt8 ? (N > MAXUInt16 ? (N > MAXUInt32 ? UInt64 : UInt32) : UInt16) : UInt8
@@ -76,6 +162,7 @@ function active_cells_map_interior(ibg)
     # Cannot findall on the entire field because we incur on OOM errors
     active_indices = IndicesType[]
     active_indices = findall_active_indices!(active_indices, active_cells_field, ibg, IndicesType)
+    active_indices = arch_array(architecture(ibg), active_indices)
 
     return active_indices
 end
@@ -103,19 +190,60 @@ end
 
 @inline add_3rd_index(t::Tuple, k) = (t[1], t[2], k) 
 
+map_interior_active_cells(ibg) = interior_active_indices(ibg; parameters = :xyz)
+
+# In case of a `DistributedGrid` we want to have different maps depending on the 
+# partitioning of the domain
+function map_interior_active_cells(ibg::ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:DistributedGrid})
+
+    arch = architecture(ibg)
+    Rx, Ry, _  = arch.ranks
+    Tx, Ty, _  = topology(ibg)
+    Nx, Ny, Nz = size(ibg)
+    Hx, Hy, _  = halo_size(ibg)
+    
+    x_boundary = (Hx, Ny, Nz)
+    y_boundary = (Nx, Hy, Nz)
+         
+    left_offsets    = (0,  0,  0)
+    right_x_offsets = (Nx-Hx, 0,     0)
+    right_y_offsets = (0,     Ny-Hy, 0)
+
+    include_west  = !isa(ibg, XFlatGrid) && (Rx != 1) && !(Tx == RightConnected)
+    include_east  = !isa(ibg, XFlatGrid) && (Rx != 1) && !(Tx == LeftConnected)
+    include_south = !isa(ibg, YFlatGrid) && (Ry != 1) && !(Ty == RightConnected)
+    include_north = !isa(ibg, YFlatGrid) && (Ry != 1) && !(Ty == LeftConnected)
+
+    west  = include_west  ? interior_active_indices(ibg; parameters = KernelParameters(x_boundary, left_offsets))    : nothing
+    east  = include_east  ? interior_active_indices(ibg; parameters = KernelParameters(x_boundary, right_x_offsets)) : nothing
+    south = include_south ? interior_active_indices(ibg; parameters = KernelParameters(y_boundary, left_offsets))    : nothing
+    north = include_north ? interior_active_indices(ibg; parameters = KernelParameters(y_boundary, right_y_offsets)) : nothing
+    
+    nx = Rx == 1 ? Nx : (Tx == RightConnected || Tx == LeftConnected ? Nx - Hx : Nx - 2Hx)
+    ny = Ry == 1 ? Ny : (Ty == RightConnected || Ty == LeftConnected ? Ny - Hy : Ny - 2Hy)
+
+    ox = Rx == 1 || Tx == RightConnected ? 0 : Hx
+    oy = Ry == 1 || Ty == RightConnected ? 0 : Hy
+     
+    interior = interior_active_indices(ibg; parameters = KernelParameters((nx, ny, Nz), (ox, oy, 0)))
+
+    return (; interior, west, east, south, north)
+end
+
 # If we eventually want to perform also barotropic step, `w` computation and `p` 
 # computation only on active `columns`
-function active_cells_map_surface(ibg)
-    active_cells_field = compute_surface_active_cells(ibg)
+function map_active_z_columns(ibg)
+    active_cells_field = compute_active_z_columns(ibg)
     interior_cells     = arch_array(CPU(), interior(active_cells_field, :, :, 1))
   
     full_indices = findall(interior_cells)
 
-    Nx, Ny, Nz = size(ibg)
+    Nx, Ny, _ = size(ibg)
     # Reduce the size of the active_cells_map (originally a tuple of Int64)
     N = max(Nx, Ny)
     IntType = N > MAXUInt8 ? (N > MAXUInt16 ? (N > MAXUInt32 ? UInt64 : UInt32) : UInt16) : UInt8
-    smaller_indices = getproperty.(full_indices, Ref(:I)) .|> Tuple{IntType, IntType}
-    
-    return smaller_indices
+    surface_map = getproperty.(full_indices, Ref(:I)) .|> Tuple{IntType, IntType}
+    surface_map = arch_array(architecture(ibg), surface_map)
+
+    return surface_map
 end
diff --git a/src/ImmersedBoundaries/grid_fitted_bottom.jl b/src/ImmersedBoundaries/grid_fitted_bottom.jl
index 8bd1d7de48..e693e20e2d 100644
--- a/src/ImmersedBoundaries/grid_fitted_bottom.jl
+++ b/src/ImmersedBoundaries/grid_fitted_bottom.jl
@@ -102,6 +102,6 @@ function on_architecture(arch, ib::GridFittedBottom{<:Field})
     return GridFittedBottom(new_bottom_height, ib.immersed_condition)
 end
 
-Adapt.adapt_structure(to, ib::GridFittedBottom) = GridFittedBottom(adapt(to, ib.bottom_height.data),
+Adapt.adapt_structure(to, ib::GridFittedBottom) = GridFittedBottom(adapt(to, ib.bottom_height),
                                                                              ib.immersed_condition)
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_boundary_tendencies.jl
index ab510f7c76..9e2715dcff 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_boundary_tendencies.jl
@@ -8,6 +8,8 @@ using Oceananigans.Models.NonhydrostaticModels: boundary_tendency_kernel_paramet
 
 import Oceananigans.Models: compute_boundary_tendencies!
 
+using Oceananigans.ImmersedBoundaries: active_interior_map, DistributedActiveCellsIBG
+
 # We assume here that top/bottom BC are always synched (no partitioning in z)
 function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
     grid = model.grid
@@ -21,8 +23,30 @@ function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
     compute_auxiliaries!(model; w_parameters, p_parameters, κ_parameters)
 
     # parameters for communicating North / South / East / West side
+    compute_boundary_tendency_contributions!(grid, arch, model)
+
+    return nothing
+end
+
+function compute_boundary_tendency_contributions!(grid, arch, model) 
     kernel_parameters = boundary_tendency_kernel_parameters(grid, arch)
     compute_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters)
+    return nothing
+end
+
+function compute_boundary_tendency_contributions!(grid::DistributedActiveCellsIBG, arch, model)
+    maps = grid.interior_active_cells
+    
+    for (name, map) in zip(keys(maps), maps)
+        compute_boundary = (name != :interior) && !isnothing(map) 
+        
+        # If there exists a boundary map, then we compute the boundary contributions
+        if compute_boundary
+            active_boundary_map = active_interior_map(Val(name))
+            compute_hydrostatic_free_surface_tendency_contributions!(model, tuple(:xyz); 
+                                                                     active_cells_map = active_boundary_map)
+        end
+    end
 
     return nothing
 end
@@ -37,8 +61,8 @@ function boundary_w_kernel_parameters(grid, arch)
              
     # Offsets in tangential direction are == -1 to
     # cover the required corners
-    Oxᴸ = (-Hx, -1)
-    Oyᴸ = (-1, -Hy)
+    Oxᴸ = (-Hx+1, -1)
+    Oyᴸ = (-1, -Hy+1)
     Oxᴿ = (Nx-1, -1)
     Oyᴿ = (-1, Ny-1)
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl
index 447d0efdf3..dc330fa1fd 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl
@@ -13,8 +13,8 @@ import Oceananigans: tracer_tendency_kernel_function
 import Oceananigans.Models: complete_communication_and_compute_boundary!
 import Oceananigans.Models: interior_tendency_kernel_parameters
 
-using Oceananigans.ImmersedBoundaries: use_only_active_interior_cells, ActiveCellsIBG, 
-                                       InteriorMap, active_linear_index_to_interior_tuple
+using Oceananigans.ImmersedBoundaries: active_interior_map, ActiveCellsIBG, 
+                                       InteriorMap, active_linear_index_to_tuple
 
 """
     compute_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
@@ -29,7 +29,7 @@ function compute_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
     # Calculate contributions to momentum and tracer tendencies from fluxes and volume terms in the
     # interior of the domain
     compute_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters;
-                                                             only_active_cells = use_only_active_interior_cells(model.grid))
+                                                             active_cells_map = active_interior_map(model.grid))
 
     complete_communication_and_compute_boundary!(model, model.grid, model.architecture)
 
@@ -80,12 +80,12 @@ end
 end
 
 """ Store previous value of the source term and compute current source term. """
-function compute_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters; only_active_cells = nothing)
+function compute_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters; active_cells_map = nothing)
 
     arch = model.architecture
     grid = model.grid
 
-    compute_hydrostatic_momentum_tendencies!(model, model.velocities, kernel_parameters; only_active_cells)
+    compute_hydrostatic_momentum_tendencies!(model, model.velocities, kernel_parameters; active_cells_map)
 
     top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
 
@@ -121,9 +121,9 @@ function compute_hydrostatic_free_surface_tendency_contributions!(model, kernel_
                     tendency_kernel!,
                     c_tendency,
                     grid,
-                    only_active_cells,
+                    active_cells_map,
                     args;
-                    only_active_cells)
+                    active_cells_map)
         end
     end
 
@@ -160,7 +160,7 @@ function compute_free_surface_tendency!(grid, model, kernel_parameters)
 end
 
 """ Calculate momentum tendencies if momentum is not prescribed."""
-function compute_hydrostatic_momentum_tendencies!(model, velocities, kernel_parameters; only_active_cells = nothing)
+function compute_hydrostatic_momentum_tendencies!(model, velocities, kernel_parameters; active_cells_map = nothing)
 
     grid = model.grid
     arch = architecture(grid)
@@ -184,17 +184,17 @@ function compute_hydrostatic_momentum_tendencies!(model, velocities, kernel_para
 
     u_kernel_args = tuple(start_momentum_kernel_args..., u_immersed_bc, end_momentum_kernel_args...)
     v_kernel_args = tuple(start_momentum_kernel_args..., v_immersed_bc, end_momentum_kernel_args...)
-        
+
     for parameters in kernel_parameters
         launch!(arch, grid, parameters,
                 compute_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, grid, 
-                only_active_cells, u_kernel_args;
-                only_active_cells)
+                active_cells_map, u_kernel_args;
+                active_cells_map)
 
         launch!(arch, grid, parameters,
                 compute_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, grid, 
-                only_active_cells, v_kernel_args;
-                only_active_cells)
+                active_cells_map, v_kernel_args;
+                active_cells_map)
     end
 
     compute_free_surface_tendency!(grid, model, :xy)
@@ -228,26 +228,26 @@ end
 #####
 
 """ Calculate the right-hand-side of the u-velocity equation. """
-@kernel function compute_hydrostatic_free_surface_Gu!(Gu, grid, interior_map, args)
+@kernel function compute_hydrostatic_free_surface_Gu!(Gu, grid, map, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Gu[i, j, k] = hydrostatic_free_surface_u_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_hydrostatic_free_surface_Gu!(Gu, grid::ActiveCellsIBG, ::InteriorMap, args)
+@kernel function compute_hydrostatic_free_surface_Gu!(Gu, grid::ActiveCellsIBG, map, args)
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
+    i, j, k = active_linear_index_to_tuple(idx, map, grid)
     @inbounds Gu[i, j, k] = hydrostatic_free_surface_u_velocity_tendency(i, j, k, grid, args...)
 end
 
 """ Calculate the right-hand-side of the v-velocity equation. """
-@kernel function compute_hydrostatic_free_surface_Gv!(Gv, grid, interior_map, args)
+@kernel function compute_hydrostatic_free_surface_Gv!(Gv, grid, map, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Gv[i, j, k] = hydrostatic_free_surface_v_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_hydrostatic_free_surface_Gv!(Gv, grid::ActiveCellsIBG, ::InteriorMap, args)
+@kernel function compute_hydrostatic_free_surface_Gv!(Gv, grid::ActiveCellsIBG, map, args)
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
+    i, j, k = active_linear_index_to_tuple(idx, map, grid)
     @inbounds Gv[i, j, k] = hydrostatic_free_surface_v_velocity_tendency(i, j, k, grid, args...)
 end
 
@@ -256,26 +256,26 @@ end
 #####
 
 """ Calculate the right-hand-side of the tracer advection-diffusion equation. """
-@kernel function compute_hydrostatic_free_surface_Gc!(Gc, grid, interior_map, args)
+@kernel function compute_hydrostatic_free_surface_Gc!(Gc, grid, map, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Gc[i, j, k] = hydrostatic_free_surface_tracer_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_hydrostatic_free_surface_Gc!(Gc, grid::ActiveCellsIBG, ::InteriorMap, args)
+@kernel function compute_hydrostatic_free_surface_Gc!(Gc, grid::ActiveCellsIBG, map, args)
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
+    i, j, k = active_linear_index_to_tuple(idx, map, grid)
     @inbounds Gc[i, j, k] = hydrostatic_free_surface_tracer_tendency(i, j, k, grid, args...)
 end
 
 """ Calculate the right-hand-side of the subgrid scale energy equation. """
-@kernel function compute_hydrostatic_free_surface_Ge!(Ge, grid, interior_map, args)
+@kernel function compute_hydrostatic_free_surface_Ge!(Ge, grid, map, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Ge[i, j, k] = hydrostatic_turbulent_kinetic_energy_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_hydrostatic_free_surface_Ge!(Ge, grid::ActiveCellsIBG, ::InteriorMap, args)
+@kernel function compute_hydrostatic_free_surface_Ge!(Ge, grid::ActiveCellsIBG, map, args)
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
+    i, j, k = active_linear_index_to_tuple(idx, map, grid)
     @inbounds Ge[i, j, k] = hydrostatic_turbulent_kinetic_energy_tendency(i, j, k, grid, args...)
 end
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
index 679889a888..4f14f69f2a 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
@@ -12,14 +12,11 @@ function SplitExplicitAuxiliaryFields(grid::DistributedGrid)
     
     Hᶠᶜ = Field((Face,   Center, Nothing), grid)
     Hᶜᶠ = Field((Center, Face,   Nothing), grid)
-    Hᶜᶜ = Field((Center, Center, Nothing), grid)
-    
+
     calculate_column_height!(Hᶠᶜ, (Face, Center, Center))
     calculate_column_height!(Hᶜᶠ, (Center, Face, Center))
 
-    calculate_column_height!(Hᶜᶜ, (Center, Center, Center))
-       
-    fill_halo_regions!((Hᶠᶜ, Hᶜᶠ, Hᶜᶜ))
+    fill_halo_regions!((Hᶠᶜ, Hᶜᶠ))
 
     # In a non-parallel grid we calculate only the interior
     kernel_size    = augmented_kernel_size(grid)
@@ -27,7 +24,7 @@ function SplitExplicitAuxiliaryFields(grid::DistributedGrid)
 
     kernel_parameters = KernelParameters(kernel_size, kernel_offsets)
     
-    return SplitExplicitAuxiliaryFields(Gᵁ, Gⱽ, Hᶠᶜ, Hᶜᶠ, Hᶜᶜ, kernel_parameters)
+    return SplitExplicitAuxiliaryFields(Gᵁ, Gⱽ, Hᶠᶜ, Hᶜᶠ, kernel_parameters)
 end
 
 """Integrate z at locations `location` and set! `height`` with the result"""
@@ -69,14 +66,15 @@ function FreeSurface(free_surface::SplitExplicitFreeSurface, velocities, grid::D
         old_halos  = halo_size(grid)
         Nsubsteps  = length(settings.substepping.averaging_weights)
 
-        new_halos = distributed_split_explicit_halos(old_halos, Nsubsteps+1, grid)         
-        new_grid  = with_halo(new_halos, grid)
+        extended_halos = distributed_split_explicit_halos(old_halos, Nsubsteps+1, grid)         
+        extended_grid  = with_halo(extended_halos, grid)
 
-        η = ZFaceField(new_grid, indices = (:, :, size(new_grid, 3)+1))
+        Nze = size(extended_grid, 3)
+        η = ZFaceField(extended_grid, indices = (:, :, Nze+1))
 
         return SplitExplicitFreeSurface(η,
-                                        SplitExplicitState(new_grid),
-                                        SplitExplicitAuxiliaryFields(new_grid),
+                                        SplitExplicitState(extended_grid, settings.timestepper),
+                                        SplitExplicitAuxiliaryFields(extended_grid),
                                         free_surface.gravitational_acceleration,
                                         free_surface.settings)
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl
index 509b1fe0ea..717bb1fd2d 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl
@@ -2,6 +2,8 @@ using Oceananigans.Fields: location
 using Oceananigans.TimeSteppers: ab2_step_field!
 using Oceananigans.TurbulenceClosures: implicit_step!
 
+using Oceananigans.ImmersedBoundaries: active_interior_map, active_surface_map
+
 import Oceananigans.TimeSteppers: ab2_step!
 
 #####
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
index 0604cd5de5..d32b5786d8 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
@@ -83,7 +83,7 @@ SplitExplicitFreeSurface(FT::DataType = Float64; gravitational_acceleration = g_
 function FreeSurface(free_surface::SplitExplicitFreeSurface, velocities, grid)
     η =  FreeSurfaceDisplacementField(velocities, free_surface, grid)
 
-    return SplitExplicitFreeSurface(η, SplitExplicitState(grid),
+    return SplitExplicitFreeSurface(η, SplitExplicitState(grid, free_surface.settings.timestepper),
                                     SplitExplicitAuxiliaryFields(grid),
                                     free_surface.gravitational_acceleration,
                                     free_surface.settings)
@@ -92,15 +92,12 @@ end
 function SplitExplicitFreeSurface(grid; gravitational_acceleration = g_Earth,
     settings = SplitExplicitSettings(eltype(grid); gravitational_acceleration, substeps = 200))
 
-    if eltype(settings) != eltype(grid)
-        @warn "Using $(eltype(settings)) settings for the SplitExplicitFreeSurface on a $(eltype(grid)) grid"
-    end
-    
-    η = ZFaceField(grid, indices = (:, :, size(grid, 3)+1))
+    Nz = size(grid, 3)
+    η  = ZFaceField(grid, indices = (:, :, Nz+1))
     gravitational_acceleration = convert(eltype(grid), gravitational_acceleration)
 
-    return SplitExplicitFreeSurface(η, SplitExplicitState(grid), SplitExplicitAuxiliaryFields(grid),
-                                    gravitational_acceleration, settings)
+    return SplitExplicitFreeSurface(η, SplitExplicitState(grid, settings.timestepper), SplitExplicitAuxiliaryFields(grid),
+           gravitational_acceleration, settings)
 end
 
 """
@@ -110,35 +107,35 @@ A type containing the state fields for the split-explicit free surface.
 
 $(FIELDS)
 """
-Base.@kwdef struct SplitExplicitState{𝒞𝒞, ℱ𝒞, 𝒞ℱ}
+Base.@kwdef struct SplitExplicitState{CC, ACC, FC, AFC, CF, ACF}
     "The free surface at time `m`. (`ReducedField` over ``z``)"
-    ηᵐ   :: 𝒞𝒞
+    ηᵐ   :: ACC
     "The free surface at time `m-1`. (`ReducedField` over ``z``)"
-    ηᵐ⁻¹ :: 𝒞𝒞
+    ηᵐ⁻¹ :: ACC
     "The free surface at time `m-2`. (`ReducedField` over ``z``)"
-    ηᵐ⁻² :: 𝒞𝒞
+    ηᵐ⁻² :: ACC
     "The barotropic zonal velocity at time `m`. (`ReducedField` over ``z``)"
-    U    :: ℱ𝒞
+    U    :: FC
     "The barotropic zonal velocity at time `m-1`. (`ReducedField` over ``z``)"
-    Uᵐ⁻¹ :: ℱ𝒞
+    Uᵐ⁻¹ :: AFC
     "The barotropic zonal velocity at time `m-2`. (`ReducedField` over ``z``)"
-    Uᵐ⁻² :: ℱ𝒞
+    Uᵐ⁻² :: AFC
     "The barotropic meridional velocity at time `m`. (`ReducedField` over ``z``)"
-    V    :: 𝒞ℱ
+    V    :: CF
     "The barotropic meridional velocity at time `m-1`. (`ReducedField` over ``z``)"
-    Vᵐ⁻¹ :: 𝒞ℱ
+    Vᵐ⁻¹ :: ACF
     "The barotropic meridional velocity at time `m-2`. (`ReducedField` over ``z``)"
-    Vᵐ⁻² :: 𝒞ℱ
+    Vᵐ⁻² :: ACF
     "The time-filtered free surface. (`ReducedField` over ``z``)"
-    η̅    :: 𝒞𝒞
+    η̅    :: CC
     "The time-filtered barotropic zonal velocity. (`ReducedField` over ``z``)"
-    U̅    :: ℱ𝒞
+    U̅    :: FC
     "The time-filtered barotropic meridional velocity. (`ReducedField` over ``z``)"
-    V̅    :: 𝒞ℱ
+    V̅    :: CF
 end
 
 """
-    SplitExplicitState(grid)
+    SplitExplicitState(grid, timestepper)
 
 Return the split-explicit state for `grid`.
 
@@ -146,24 +143,26 @@ Note that `η̅` is solely used for setting the `η` at the next substep iterati
 acts as a filter for `η`. Values with superscripts `m-1` and `m-2` correspond to previous stored
 time steps to allow using a higher-order time stepping scheme, e.g., `AdamsBashforth3Scheme`.
 """
-function SplitExplicitState(grid::AbstractGrid)
-    η̅ = ZFaceField(grid, indices = (:, :, size(grid, 3)+1))
+function SplitExplicitState(grid::AbstractGrid, timestepper)
+    
+    Nz = size(grid, 3)
+    
+    η̅ = ZFaceField(grid, indices = (:, :, Nz+1))
 
-    ηᵐ   = ZFaceField(grid, indices = (:, :, size(grid, 3)+1))
-    ηᵐ⁻¹ = ZFaceField(grid, indices = (:, :, size(grid, 3)+1))
-    ηᵐ⁻² = ZFaceField(grid, indices = (:, :, size(grid, 3)+1))
+    ηᵐ   = auxiliary_free_surface_field(grid, timestepper)
+    ηᵐ⁻¹ = auxiliary_free_surface_field(grid, timestepper)
+    ηᵐ⁻² = auxiliary_free_surface_field(grid, timestepper)
           
-    U    = Field((Face, Center, Nothing), grid)
-    V    = Field((Center, Face, Nothing), grid)
+    U    = XFaceField(grid, indices = (:, :, Nz))
+    V    = YFaceField(grid, indices = (:, :, Nz))
 
-    Uᵐ⁻¹ = Field((Face, Center, Nothing), grid)
-    Vᵐ⁻¹ = Field((Center, Face, Nothing), grid)
-          
-    Uᵐ⁻² = Field((Face, Center, Nothing), grid)
-    Vᵐ⁻² = Field((Center, Face, Nothing), grid)
+    Uᵐ⁻¹ = auxiliary_barotropic_U_field(grid, timestepper)
+    Vᵐ⁻¹ = auxiliary_barotropic_V_field(grid, timestepper)
+    Uᵐ⁻² = auxiliary_barotropic_U_field(grid, timestepper)
+    Vᵐ⁻² = auxiliary_barotropic_V_field(grid, timestepper)
           
-    U̅    = Field((Face, Center, Nothing), grid)
-    V̅    = Field((Center, Face, Nothing), grid)
+    U̅ = XFaceField(grid, indices = (:, :, Nz))
+    V̅ = YFaceField(grid, indices = (:, :, Nz))
     
     return SplitExplicitState(; ηᵐ, ηᵐ⁻¹, ηᵐ⁻², U, Uᵐ⁻¹, Uᵐ⁻², V, Vᵐ⁻¹, Vᵐ⁻², η̅, U̅, V̅)
 end
@@ -179,7 +178,7 @@ large (or `:xy` in case of a serial computation), and start computing from
 
 $(FIELDS)
 """
-Base.@kwdef struct SplitExplicitAuxiliaryFields{𝒞ℱ, ℱ𝒞, 𝒞𝒞, 𝒦}
+Base.@kwdef struct SplitExplicitAuxiliaryFields{𝒞ℱ, ℱ𝒞, 𝒦}
     "Vertically-integrated slow barotropic forcing function for `U` (`ReducedField` over ``z``)"
     Gᵁ :: ℱ𝒞
     "Vertically-integrated slow barotropic forcing function for `V` (`ReducedField` over ``z``)"
@@ -188,8 +187,6 @@ Base.@kwdef struct SplitExplicitAuxiliaryFields{𝒞ℱ, ℱ𝒞, 𝒞𝒞, 𝒦
     Hᶠᶜ :: ℱ𝒞
     "Depth at `(Center, Face)` (`ReducedField` over ``z``)"
     Hᶜᶠ :: 𝒞ℱ
-    "Depth at `(Center, Center)` (`ReducedField` over ``z``)"
-    Hᶜᶜ :: 𝒞𝒞
     "kernel size for barotropic time stepping"
     kernel_parameters :: 𝒦
 end
@@ -206,7 +203,6 @@ function SplitExplicitAuxiliaryFields(grid::AbstractGrid)
 
     Hᶠᶜ = Field((Face,   Center, Nothing), grid)
     Hᶜᶠ = Field((Center, Face,   Nothing), grid)
-    Hᶜᶜ = Field((Center, Center, Nothing), grid)
 
     dz = GridMetricOperation((Face, Center, Center), Δz, grid)
     sum!(Hᶠᶜ, dz)
@@ -214,14 +210,11 @@ function SplitExplicitAuxiliaryFields(grid::AbstractGrid)
     dz = GridMetricOperation((Center, Face, Center), Δz, grid)
     sum!(Hᶜᶠ, dz)
 
-    dz = GridMetricOperation((Center, Center, Center), Δz, grid)
-    sum!(Hᶜᶜ, dz)
-
-    fill_halo_regions!((Hᶠᶜ, Hᶜᶠ, Hᶜᶜ))
+    fill_halo_regions!((Hᶠᶜ, Hᶜᶠ))
 
     kernel_parameters = :xy
 
-    return SplitExplicitAuxiliaryFields(Gᵁ, Gⱽ, Hᶠᶜ, Hᶜᶠ, Hᶜᶜ, kernel_parameters)
+    return SplitExplicitAuxiliaryFields(Gᵁ, Gⱽ, Hᶠᶜ, Hᶜᶠ, kernel_parameters)
 end
 
 """
@@ -239,6 +232,15 @@ end
 struct AdamsBashforth3Scheme end
 struct ForwardBackwardScheme end
 
+
+auxiliary_free_surface_field(grid, ::AdamsBashforth3Scheme) = ZFaceField(grid, indices = (:, :, size(grid, 3)+1))
+auxiliary_free_surface_field(grid, ::ForwardBackwardScheme) = nothing
+
+auxiliary_barotropic_U_field(grid, ::AdamsBashforth3Scheme) = XFaceField(grid, indices = (:, :, size(grid, 3)))
+auxiliary_barotropic_U_field(grid, ::ForwardBackwardScheme) = nothing
+auxiliary_barotropic_V_field(grid, ::AdamsBashforth3Scheme) = YFaceField(grid, indices = (:, :, size(grid, 3)))
+auxiliary_barotropic_V_field(grid, ::ForwardBackwardScheme) = nothing
+
 # (p = 2, q = 4, r = 0.18927) minimize dispersion error from Shchepetkin and McWilliams (2005): https://doi.org/10.1016/j.ocemod.2004.08.002 
 @inline function averaging_shape_function(τ::FT; p = 2, q = 4, r = FT(0.18927)) where FT 
     τ₀ = (p + 2) * (p + q + 2) / (p + 1) / (p + q + 1)
@@ -291,7 +293,7 @@ end
     averaging_weights = averaging_weights[1:idx]
     averaging_weights ./= sum(averaging_weights)
 
-    return Δτ, averaging_weights
+    return Δτ, tuple(averaging_weights...)
 end
 
 function SplitExplicitSettings(FT::DataType=Float64;
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index b9389a0440..d9a1e67193 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -4,15 +4,19 @@ using Oceananigans.Utils
 using Oceananigans.AbstractOperations: Δz  
 using Oceananigans.BoundaryConditions
 using Oceananigans.Operators
-using Oceananigans.ImmersedBoundaries: peripheral_node, immersed_inactive_node
-using Oceananigans.ImmersedBoundaries: inactive_node, IBG, c, f
-using Oceananigans.ImmersedBoundaries: mask_immersed_field!
+using Oceananigans.Architectures: convert_args
+using Oceananigans.ImmersedBoundaries: peripheral_node, immersed_inactive_node, GFBIBG
+using Oceananigans.ImmersedBoundaries: inactive_node, IBG, c, f, ZColumnMap
+using Oceananigans.ImmersedBoundaries: mask_immersed_field!, active_surface_map, active_interior_map
+using Oceananigans.ImmersedBoundaries: active_linear_index_to_tuple, ActiveCellsIBG, ActiveZColumnsIBG
+using Oceananigans.DistributedComputations: child_architecture
+using Oceananigans.DistributedComputations: Distributed
+using Printf
 
 # constants for AB3 time stepping scheme (from https://doi.org/10.1016/j.ocemod.2004.08.002)
 const β = 0.281105
 const α = 1.5 + β
 const θ = - 0.5 - 2β
-
 const γ = 0.088
 const δ = 0.614
 const ϵ = 0.013
@@ -99,9 +103,9 @@ end
 # Time stepping extrapolation U★, and η★
 
 # AB3 step
-@inline function U★(i, j, k, grid, ::AdamsBashforth3Scheme, ϕᵐ, ϕᵐ⁻¹, ϕᵐ⁻²)
+@inline function U★(i, j, k, grid, ::AdamsBashforth3Scheme, Uᵐ, Uᵐ⁻¹, Uᵐ⁻²)
     FT = eltype(grid)
-    return @inbounds FT(α) * ϕᵐ[i, j, k] + FT(θ) * ϕᵐ⁻¹[i, j, k] + FT(β) * ϕᵐ⁻²[i, j, k]
+    return @inbounds FT(α) * Uᵐ[i, j, k] + FT(θ) * Uᵐ⁻¹[i, j, k] + FT(β) * Uᵐ⁻²[i, j, k]
 end
 
 @inline function η★(i, j, k, grid, ::AdamsBashforth3Scheme, ηᵐ⁺¹, ηᵐ, ηᵐ⁻¹, ηᵐ⁻²)
@@ -110,7 +114,7 @@ end
 end
 
 # Forward Backward Step
-@inline U★(i, j, k, grid, ::ForwardBackwardScheme, ϕ, args...) = @inbounds ϕ[i, j, k]
+@inline U★(i, j, k, grid, ::ForwardBackwardScheme, U, args...) = @inbounds U[i, j, k]
 @inline η★(i, j, k, grid, ::ForwardBackwardScheme, η, args...) = @inbounds η[i, j, k]
 
 @inline advance_previous_velocity!(i, j, k, ::ForwardBackwardScheme, U, Uᵐ⁻¹, Uᵐ⁻²) = nothing
@@ -132,16 +136,13 @@ end
     return nothing
 end
 
-using Oceananigans.DistributedComputations: Distributed
-using Printf
-
-@kernel function split_explicit_free_surface_evolution_kernel!(grid, Δτ, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻², U, V, Uᵐ⁻¹, Uᵐ⁻², Vᵐ⁻¹, Vᵐ⁻², 
-                                                               η̅, U̅, V̅, averaging_weight,
-                                                               Gᵁ, Gⱽ, g, Hᶠᶜ, Hᶜᶠ,
-                                                               timestepper)
+@kernel function _split_explicit_free_surface!(grid, Δτ, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻², U, V, Uᵐ⁻¹, Uᵐ⁻², Vᵐ⁻¹, Vᵐ⁻², timestepper)
     i, j = @index(Global, NTuple)
-    k_top = grid.Nz+1
+    free_surface_evolution!(i, j, grid, Δτ, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻², U, V, Uᵐ⁻¹, Uᵐ⁻², Vᵐ⁻¹, Vᵐ⁻², timestepper)
+end
 
+@inline function free_surface_evolution!(i, j, grid, Δτ, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻², U, V, Uᵐ⁻¹, Uᵐ⁻², Vᵐ⁻¹, Vᵐ⁻², timestepper)
+    k_top = grid.Nz+1
     TX, TY, _ = topology(grid)
 
     @inbounds begin        
@@ -150,81 +151,100 @@ using Printf
         η[i, j, k_top] -= Δτ * (div_xᶜᶜᶠ_U(i, j, k_top-1, grid, TX, U★, timestepper, U, Uᵐ⁻¹, Uᵐ⁻²) +
                                 div_yᶜᶜᶠ_V(i, j, k_top-1, grid, TY, U★, timestepper, V, Vᵐ⁻¹, Vᵐ⁻²))
     end
+
+    return nothing
 end
 
-@kernel function split_explicit_barotropic_velocity_evolution_kernel!(grid, Δτ, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻², U, V, Uᵐ⁻¹, Uᵐ⁻², Vᵐ⁻¹, Vᵐ⁻²,
-                                                                      η̅, U̅, V̅, averaging_weight,
-                                                                      Gᵁ, Gⱽ, g, Hᶠᶜ, Hᶜᶠ,
-                                                                      timestepper)
+@kernel function _split_explicit_barotropic_velocity!(averaging_weight, grid, Δτ, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻², 
+                                                      U, Uᵐ⁻¹, Uᵐ⁻², V,  Vᵐ⁻¹, Vᵐ⁻²,
+                                                      η̅, U̅, V̅, Gᵁ, Gⱽ, Hᶠᶜ, Hᶜᶠ, g, 
+                                                      timestepper)
     i, j = @index(Global, NTuple)
-    k_top = grid.Nz + 1
+    velocity_evolution!(i, j, grid, Δτ, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻², 
+                        U, Uᵐ⁻¹, Uᵐ⁻², V,  Vᵐ⁻¹, Vᵐ⁻²,
+                        η̅, U̅, V̅, averaging_weight,
+                        Gᵁ, Gⱽ, Hᶠᶜ, Hᶜᶠ, g, 
+                        timestepper)
+end
 
+@inline function velocity_evolution!(i, j, grid, Δτ, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻², 
+                                     U, Uᵐ⁻¹, Uᵐ⁻², V,  Vᵐ⁻¹, Vᵐ⁻²,
+                                     η̅, U̅, V̅, averaging_weight,
+                                     Gᵁ, Gⱽ, Hᶠᶜ, Hᶜᶠ, g, 
+                                     timestepper)
+    k_top = grid.Nz+1
+    
     TX, TY, _ = topology(grid)
 
     @inbounds begin 
-        advance_previous_velocity!(i, j, 1, timestepper, U, Uᵐ⁻¹, Uᵐ⁻²)
-        advance_previous_velocity!(i, j, 1, timestepper, V, Vᵐ⁻¹, Vᵐ⁻²)
+        advance_previous_velocity!(i, j, k_top-1, timestepper, U, Uᵐ⁻¹, Uᵐ⁻²)
+        advance_previous_velocity!(i, j, k_top-1, timestepper, V, Vᵐ⁻¹, Vᵐ⁻²)
 
         # ∂τ(U) = - ∇η + G
-        U[i, j, 1] +=  Δτ * (- g * Hᶠᶜ[i, j] * ∂xᶠᶜᶠ_η(i, j, k_top, grid, TX, η★, timestepper, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻²) + Gᵁ[i, j, 1])
-        V[i, j, 1] +=  Δτ * (- g * Hᶜᶠ[i, j] * ∂yᶜᶠᶠ_η(i, j, k_top, grid, TY, η★, timestepper, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻²) + Gⱽ[i, j, 1])
-
+        U[i, j, k_top-1] +=  Δτ * (- g * Hᶠᶜ[i, j, 1] * ∂xᶠᶜᶠ_η(i, j, k_top, grid, TX, η★, timestepper, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻²) + Gᵁ[i, j, k_top-1])
+        V[i, j, k_top-1] +=  Δτ * (- g * Hᶜᶠ[i, j, 1] * ∂yᶜᶠᶠ_η(i, j, k_top, grid, TY, η★, timestepper, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻²) + Gⱽ[i, j, k_top-1])
+                          
         # time-averaging
-        η̅[i, j, k_top] += averaging_weight * η[i, j, k_top]
-        U̅[i, j, 1]     += averaging_weight * U[i, j, 1]
-        V̅[i, j, 1]     += averaging_weight * V[i, j, 1]
+        η̅[i, j, k_top]   += averaging_weight * η[i, j, k_top]
+        U̅[i, j, k_top-1] += averaging_weight * U[i, j, k_top-1]
+        V̅[i, j, k_top-1] += averaging_weight * V[i, j, k_top-1]
     end
 end
 
-function split_explicit_free_surface_substep!(η, state, auxiliary, settings, weights, arch, grid, g, Δτ, substep_index)
-    # unpack state quantities, parameters and forcing terms 
-    U, V             = state.U,    state.V
-    Uᵐ⁻¹, Uᵐ⁻²       = state.Uᵐ⁻¹, state.Uᵐ⁻²
-    Vᵐ⁻¹, Vᵐ⁻²       = state.Vᵐ⁻¹, state.Vᵐ⁻²
-    ηᵐ, ηᵐ⁻¹, ηᵐ⁻²   = state.ηᵐ,   state.ηᵐ⁻¹, state.ηᵐ⁻²
-    η̅, U̅, V̅          = state.η̅, state.U̅, state.V̅
-    Gᵁ, Gⱽ, Hᶠᶜ, Hᶜᶠ = auxiliary.Gᵁ, auxiliary.Gⱽ, auxiliary.Hᶠᶜ, auxiliary.Hᶜᶠ
-
-    timestepper      = settings.timestepper
-    averaging_weight = weights[substep_index]
-
-    parameters = auxiliary.kernel_parameters
-
-    args = (grid, Δτ, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻², U, V, Uᵐ⁻¹, Uᵐ⁻², Vᵐ⁻¹, Vᵐ⁻², 
-            η̅, U̅, V̅, averaging_weight, 
-            Gᵁ, Gⱽ, g, Hᶠᶜ, Hᶜᶠ, timestepper)
+# Barotropic Model Kernels
+# u_Δz = u * Δz
+@kernel function _barotropic_mode_kernel!(U, V, grid, u, v)
+    i, j  = @index(Global, NTuple)	
+    k_top = grid.Nz+1
 
-    launch!(arch, grid, parameters, split_explicit_free_surface_evolution_kernel!,        args...)
-    launch!(arch, grid, parameters, split_explicit_barotropic_velocity_evolution_kernel!, args...)
+    # hand unroll first loop
+    @inbounds U[i, j, k_top-1] = Δzᶠᶜᶜ(i, j, 1, grid) * u[i, j, 1]
+    @inbounds V[i, j, k_top-1] = Δzᶜᶠᶜ(i, j, 1, grid) * v[i, j, 1]
 
-    return nothing
+    @unroll for k in 2:grid.Nz
+        @inbounds U[i, j, k_top-1] += Δzᶠᶜᶜ(i, j, k, grid) * u[i, j, k]
+        @inbounds V[i, j, k_top-1] += Δzᶜᶠᶜ(i, j, k, grid) * v[i, j, k]
+    end
 end
 
 # Barotropic Model Kernels
 # u_Δz = u * Δz
-# Possibly we can optimize this further by passing in Val(Nz) in order to 
-# @unroll the loop over `k`.
-@kernel function _barotropic_mode_kernel!(U, V, grid, u, v)
-    i, j  = @index(Global, NTuple)
+@kernel function _barotropic_mode_kernel!(U, V, grid::ActiveZColumnsIBG, u, v)
+    idx = @index(Global, Linear)
+    i, j = active_linear_index_to_tuple(idx, ZColumnMap(), grid)
+    k_top = grid.Nz+1
 
-    @inbounds U[i, j, 1] = Δzᶠᶜᶜ(i, j, 1, grid) * u[i, j, 1]
-    @inbounds V[i, j, 1] = Δzᶜᶠᶜ(i, j, 1, grid) * v[i, j, 1]
+    # hand unroll first loop
+    @inbounds U[i, j, k_top-1] = Δzᶠᶜᶜ(i, j, 1, grid) * u[i, j, 1]
+    @inbounds V[i, j, k_top-1] = Δzᶜᶠᶜ(i, j, 1, grid) * v[i, j, 1]
 
-    for k in 2:grid.Nz
-        @inbounds U[i, j, 1] += Δzᶠᶜᶜ(i, j, k, grid) * u[i, j, k]
-        @inbounds V[i, j, 1] += Δzᶜᶠᶜ(i, j, k, grid) * v[i, j, k]
+    @unroll for k in 2:grid.Nz
+        @inbounds U[i, j, k_top-1] += Δzᶠᶜᶜ(i, j, k, grid) * u[i, j, k]
+        @inbounds V[i, j, k_top-1] += Δzᶜᶠᶜ(i, j, k, grid) * v[i, j, k]
     end
 end
 
 compute_barotropic_mode!(U, V, grid, u, v) = 
-    launch!(architecture(grid), grid, :xy, _barotropic_mode_kernel!, U, V, grid, u, v)
+    launch!(architecture(grid), grid, :xy, _barotropic_mode_kernel!, U, V, grid, u, v; 
+            active_cells_map = active_surface_map(grid))
 
-function initialize_free_surface_state!(free_surface_state, η)
-    state = free_surface_state
+function initialize_free_surface_state!(state, η, timestepper)
 
     parent(state.U) .= parent(state.U̅)
     parent(state.V) .= parent(state.V̅)
 
+    initialize_auxiliary_state!(state, η, timestepper)
+
+    fill!(state.η̅, 0)
+    fill!(state.U̅, 0)
+    fill!(state.V̅, 0)
+
+    return nothing
+end
+
+initialize_auxiliary_state!(state, η, ::ForwardBackwardScheme) = nothing
+
+function initialize_auxiliary_state!(state, η, timestepper)
     parent(state.Uᵐ⁻¹) .= parent(state.U̅)
     parent(state.Vᵐ⁻¹) .= parent(state.V̅)
 
@@ -235,35 +255,32 @@ function initialize_free_surface_state!(free_surface_state, η)
     parent(state.ηᵐ⁻¹) .= parent(η)
     parent(state.ηᵐ⁻²) .= parent(η)
 
-    fill!(state.η̅, 0)
-    fill!(state.U̅, 0)
-    fill!(state.V̅, 0)
-
     return nothing
 end
 
-@kernel function barotropic_split_explicit_corrector_kernel!(u, v, U̅, V̅, U, V, Hᶠᶜ, Hᶜᶠ)
+@kernel function _barotropic_split_explicit_corrector!(u, v, U̅, V̅, U, V, Hᶠᶜ, Hᶜᶠ, grid)
     i, j, k = @index(Global, NTuple)
+    k_top = grid.Nz+1
 
     @inbounds begin
-        u[i, j, k] = u[i, j, k] + (U̅[i, j] - U[i, j]) / Hᶠᶜ[i, j]
-        v[i, j, k] = v[i, j, k] + (V̅[i, j] - V[i, j]) / Hᶜᶠ[i, j]
+        u[i, j, k] = u[i, j, k] + (U̅[i, j, k_top-1] - U[i, j, k_top-1]) / Hᶠᶜ[i, j, 1]
+        v[i, j, k] = v[i, j, k] + (V̅[i, j, k_top-1] - V[i, j, k_top-1]) / Hᶜᶠ[i, j, 1]
     end
 end
 
 function barotropic_split_explicit_corrector!(u, v, free_surface, grid)
     sefs       = free_surface.state
-    Hᶠᶜ, Hᶜᶠ   = free_surface.auxiliary.Hᶠᶜ, free_surface.auxiliary.Hᶜᶠ
     U, V, U̅, V̅ = sefs.U, sefs.V, sefs.U̅, sefs.V̅
+    Hᶠᶜ, Hᶜᶠ   = free_surface.auxiliary.Hᶠᶜ, free_surface.auxiliary.Hᶜᶠ
     arch       = architecture(grid)
 
+
     # take out "bad" barotropic mode, 
     # !!!! reusing U and V for this storage since last timestep doesn't matter
     compute_barotropic_mode!(U, V, grid, u, v)
     # add in "good" barotropic mode
-
-    launch!(arch, grid, :xyz, barotropic_split_explicit_corrector_kernel!,
-            u, v, U̅, V̅, U, V, Hᶠᶜ, Hᶜᶠ)
+    launch!(arch, grid, :xyz, _barotropic_split_explicit_corrector!,
+            u, v, U̅, V̅, U, V, Hᶠᶜ, Hᶜᶠ, grid)
 
     return nothing
 end
@@ -288,11 +305,24 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
     # Wait for previous set up
     wait_free_surface_communication!(free_surface, architecture(free_surface_grid))
 
+    # Calculate the substepping parameterers
+    settings = free_surface.settings 
+    Nsubsteps = calculate_substeps(settings.substepping, Δt)
+    
+    # barotropic time step as fraction of baroclinic step and averaging weights
+    fractional_Δt, weights = calculate_adaptive_settings(settings.substepping, Nsubsteps) 
+    Nsubsteps = length(weights)
+
+    # barotropic time step in seconds
+    Δτᴮ = fractional_Δt * Δt  
+    
     # reset free surface averages
     @apply_regionally begin 
-        initialize_free_surface_state!(free_surface.state, free_surface.η)
+        initialize_free_surface_state!(free_surface.state, free_surface.η, settings.timestepper)
+        
         # Solve for the free surface at tⁿ⁺¹
-        iterate_split_explicit!(free_surface, free_surface_grid, Δt)
+        iterate_split_explicit!(free_surface, free_surface_grid, Δτᴮ, weights, Val(Nsubsteps))
+        
         # Reset eta for the next timestep
         set!(free_surface.η, free_surface.state.η̅)
     end
@@ -324,7 +354,10 @@ const MINIMUM_SUBSTEPS = 5
 @inline calculate_adaptive_settings(substepping::FTS, substeps) = weights_from_substeps(eltype(substepping.Δt_barotropic),
                                                                                         substeps, substepping.averaging_kernel)
 
-function iterate_split_explicit!(free_surface, grid, Δt)
+const FixedSubstepsSetting{N} = SplitExplicitSettings{<:FixedSubstepNumber{<:Any, <:NTuple{N, <:Any}}} where N
+const FixedSubstepsSplitExplicit{F} = SplitExplicitFreeSurface{<:Any, <:Any, <:Any, <:Any, <:FixedSubstepsSetting{N}} where N
+
+function iterate_split_explicit!(free_surface, grid, Δτᴮ, weights, ::Val{Nsubsteps}) where Nsubsteps
     arch = architecture(grid)
 
     η         = free_surface.η
@@ -332,16 +365,46 @@ function iterate_split_explicit!(free_surface, grid, Δt)
     auxiliary = free_surface.auxiliary
     settings  = free_surface.settings
     g         = free_surface.gravitational_acceleration
-
-    Nsubsteps  = calculate_substeps(settings.substepping, Δt)
-    fractional_Δt, weights = calculate_adaptive_settings(settings.substepping, Nsubsteps) # barotropic time step in fraction of baroclinic step and averaging weights
-
-    Nsubsteps = length(weights)
-
-    Δτᴮ = fractional_Δt * Δt  # barotropic time step in seconds
-
-    for substep in 1:Nsubsteps
-        split_explicit_free_surface_substep!(η, state, auxiliary, settings, weights, arch, grid, g, Δτᴮ, substep)
+    
+    # unpack state quantities, parameters and forcing terms 
+    U, V             = state.U,    state.V
+    Uᵐ⁻¹, Uᵐ⁻²       = state.Uᵐ⁻¹, state.Uᵐ⁻²
+    Vᵐ⁻¹, Vᵐ⁻²       = state.Vᵐ⁻¹, state.Vᵐ⁻²
+    ηᵐ, ηᵐ⁻¹, ηᵐ⁻²   = state.ηᵐ,   state.ηᵐ⁻¹, state.ηᵐ⁻²
+    η̅, U̅, V̅          = state.η̅, state.U̅, state.V̅
+    Gᵁ, Gⱽ, Hᶠᶜ, Hᶜᶠ = auxiliary.Gᵁ, auxiliary.Gⱽ, auxiliary.Hᶠᶜ, auxiliary.Hᶜᶠ
+    
+    timestepper      = settings.timestepper
+    
+    parameters = auxiliary.kernel_parameters
+    
+    free_surface_kernel! = configured_kernel(arch, grid, parameters, _split_explicit_free_surface!)
+    barotropic_velocity_kernel! = configured_kernel(arch, grid, parameters, _split_explicit_barotropic_velocity!)
+
+    η_args = (grid, Δτᴮ, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻², 
+              U, V, Uᵐ⁻¹, Uᵐ⁻², Vᵐ⁻¹, Vᵐ⁻², 
+              timestepper)
+
+    U_args = (grid, Δτᴮ, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻², 
+              U, Uᵐ⁻¹, Uᵐ⁻², V,  Vᵐ⁻¹, Vᵐ⁻²,
+              η̅, U̅, V̅, Gᵁ, Gⱽ, Hᶠᶜ, Hᶜᶠ, g, 
+              timestepper)
+
+    GC.@preserve η_args U_args begin
+
+        # We need to perform ~50 time-steps which means
+        # launching ~100 very small kernels: we are limited by
+        # latency of argument conversion to GPU-compatible values.
+        # To alleviate this penalty we convert first and then we substep!
+        converted_η_args = convert_args(arch, η_args)
+        converted_U_args = convert_args(arch, U_args)
+            
+        @unroll for substep in 1:Nsubsteps
+            Base.@_inline_meta
+            averaging_weight = weights[substep]
+            free_surface_kernel!(converted_η_args...)
+            barotropic_velocity_kernel!(averaging_weight, converted_U_args...)
+        end
     end
 
     return nothing
@@ -351,14 +414,32 @@ end
 # Possibly we can optimize this further by passing in Val(Nz) in order to 
 # @unroll the loop over `k`.
 @kernel function _compute_integrated_ab2_tendencies!(Gᵁ, Gⱽ, grid, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ)
-    i, j  = @index(Global, NTuple)
+    i, j  = @index(Global, NTuple)	
+    k_top = grid.Nz+1
 
-    @inbounds Gᵁ[i, j, 1] = Δzᶠᶜᶜ(i, j, 1, grid) * ab2_step_Gu(i, j, 1, grid, Gu⁻, Guⁿ, χ)
-    @inbounds Gⱽ[i, j, 1] = Δzᶜᶠᶜ(i, j, 1, grid) * ab2_step_Gv(i, j, 1, grid, Gv⁻, Gvⁿ, χ)
+    # hand unroll first loop 	
+    @inbounds Gᵁ[i, j, k_top-1] = Δzᶠᶜᶜ(i, j, 1, grid) * ab2_step_Gu(i, j, 1, grid, Gu⁻, Guⁿ, χ)
+    @inbounds Gⱽ[i, j, k_top-1] = Δzᶜᶠᶜ(i, j, 1, grid) * ab2_step_Gv(i, j, 1, grid, Gv⁻, Gvⁿ, χ)
 
-    for k in 2:grid.Nz	
-        @inbounds Gᵁ[i, j, 1] += Δzᶠᶜᶜ(i, j, k, grid) * ab2_step_Gu(i, j, k, grid, Gu⁻, Guⁿ, χ)
-        @inbounds Gⱽ[i, j, 1] += Δzᶜᶠᶜ(i, j, k, grid) * ab2_step_Gv(i, j, k, grid, Gv⁻, Gvⁿ, χ)
+    @unroll for k in 2:grid.Nz	
+        @inbounds Gᵁ[i, j, k_top-1] += Δzᶠᶜᶜ(i, j, k, grid) * ab2_step_Gu(i, j, k, grid, Gu⁻, Guⁿ, χ)
+        @inbounds Gⱽ[i, j, k_top-1] += Δzᶜᶠᶜ(i, j, k, grid) * ab2_step_Gv(i, j, k, grid, Gv⁻, Gvⁿ, χ)
+    end	
+end
+
+# Calculate RHS for the barotopic time step. 
+@kernel function _compute_integrated_ab2_tendencies!(Gᵁ, Gⱽ, grid::ActiveZColumnsIBG, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ)
+    idx = @index(Global, Linear)
+    i, j = active_linear_index_to_tuple(idx, ZColumnMap(), grid)
+    k_top = grid.Nz+1
+
+    # hand unroll first loop 	
+    @inbounds Gᵁ[i, j, k_top-1] = Δzᶠᶜᶜ(i, j, 1, grid) * ab2_step_Gu(i, j, 1, grid, Gu⁻, Guⁿ, χ)
+    @inbounds Gⱽ[i, j, k_top-1] = Δzᶜᶠᶜ(i, j, 1, grid) * ab2_step_Gv(i, j, 1, grid, Gv⁻, Gvⁿ, χ)
+
+    @unroll for k in 2:grid.Nz	
+        @inbounds Gᵁ[i, j, k_top-1] += Δzᶠᶜᶜ(i, j, k, grid) * ab2_step_Gu(i, j, k, grid, Gu⁻, Guⁿ, χ)
+        @inbounds Gⱽ[i, j, k_top-1] += Δzᶜᶠᶜ(i, j, k, grid) * ab2_step_Gv(i, j, k, grid, Gv⁻, Gvⁿ, χ)
     end	
 end
 
@@ -371,9 +452,7 @@ end
 # Setting up the RHS for the barotropic step (tendencies of the barotopic velocity components)
 # This function is called after `calculate_tendency` and before `ab2_step_velocities!`
 function setup_free_surface!(model, free_surface::SplitExplicitFreeSurface, χ)
-
-    free_surface_grid = free_surface.η.grid
-
+    
     # we start the time integration of η from the average ηⁿ     
     Gu⁻ = model.timestepper.G⁻.u
     Gv⁻ = model.timestepper.G⁻.v
@@ -382,7 +461,7 @@ function setup_free_surface!(model, free_surface::SplitExplicitFreeSurface, χ)
 
     auxiliary = free_surface.auxiliary
 
-    @apply_regionally setup_split_explicit_tendency!(auxiliary, free_surface_grid, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ)
+    @apply_regionally setup_split_explicit_tendency!(auxiliary, model.grid, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ)
 
     fields_to_fill = (auxiliary.Gᵁ, auxiliary.Gⱽ)
     fill_halo_regions!(fields_to_fill; async = true)
@@ -391,6 +470,8 @@ function setup_free_surface!(model, free_surface::SplitExplicitFreeSurface, χ)
 end
 
 setup_split_explicit_tendency!(auxiliary, grid, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ) =
-    launch!(architecture(grid), grid, :xy, _compute_integrated_ab2_tendencies!, auxiliary.Gᵁ, auxiliary.Gⱽ, grid, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ)
-
+    launch!(architecture(grid), grid, :xy, _compute_integrated_ab2_tendencies!, auxiliary.Gᵁ, auxiliary.Gⱽ, grid, 
+            Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ; active_cells_map = active_surface_map(grid))
+            
 wait_free_surface_communication!(free_surface, arch) = nothing
+
diff --git a/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl
index 2473b6ff7d..fa5e391a42 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl
@@ -4,6 +4,7 @@ using Oceananigans.TimeSteppers:  store_field_tendencies!
 
 using Oceananigans: prognostic_fields
 using Oceananigans.Grids: AbstractGrid
+using Oceananigans.ImmersedBoundaries: active_interior_map
 
 using Oceananigans.Utils: launch!
 
@@ -36,7 +37,6 @@ function store_tendencies!(model::HydrostaticFreeSurfaceModel)
         launch!(model.architecture, model.grid, :xyz,
                 store_field_tendencies!,
                 model.timestepper.G⁻[field_name],
-                model.grid,
                 model.timestepper.Gⁿ[field_name])
     end
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
index 4477505945..4cf7c19521 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
@@ -36,7 +36,6 @@ function update_state!(model::HydrostaticFreeSurfaceModel, grid, callbacks; comp
     @apply_regionally update_model_field_time_series!(model, model.clock)
 
     fill_halo_regions!(prognostic_fields(model), model.clock, fields(model); async = true)
-
     @apply_regionally replace_horizontal_vector_halos!(model.velocities, model.grid)
     @apply_regionally compute_auxiliaries!(model)
 
diff --git a/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl b/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl
index 4284e70f22..70bc3b922a 100644
--- a/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl
@@ -3,8 +3,8 @@ using Oceananigans: fields, TendencyCallsite
 using Oceananigans.Utils: work_layout
 using Oceananigans.Models: complete_communication_and_compute_boundary!, interior_tendency_kernel_parameters
 
-using Oceananigans.ImmersedBoundaries: use_only_active_interior_cells, ActiveCellsIBG, 
-                                       InteriorMap, active_linear_index_to_interior_tuple
+using Oceananigans.ImmersedBoundaries: active_interior_map, ActiveCellsIBG, 
+                                       InteriorMap, active_linear_index_to_tuple
 
 import Oceananigans.TimeSteppers: compute_tendencies!
 
@@ -28,7 +28,7 @@ function compute_tendencies!(model::NonhydrostaticModel, callbacks)
     # interior of the domain
     kernel_parameters = tuple(interior_tendency_kernel_parameters(model.grid))
 
-    compute_interior_tendency_contributions!(model, kernel_parameters; only_active_cells = use_only_active_interior_cells(model.grid))
+    compute_interior_tendency_contributions!(model, kernel_parameters; active_cells_map = active_interior_map(model.grid))
     complete_communication_and_compute_boundary!(model, model.grid, model.architecture)
 
     # Calculate contributions to momentum and tracer tendencies from user-prescribed fluxes across the
@@ -50,7 +50,7 @@ function compute_tendencies!(model::NonhydrostaticModel, callbacks)
 end
 
 """ Store previous value of the source term and compute current source term. """
-function compute_interior_tendency_contributions!(model, kernel_parameters; only_active_cells = nothing)
+function compute_interior_tendency_contributions!(model, kernel_parameters; active_cells_map = nothing)
 
     tendencies           = model.timestepper.Gⁿ
     arch                 = model.architecture
@@ -91,16 +91,16 @@ function compute_interior_tendency_contributions!(model, kernel_parameters; only
 
     for parameters in kernel_parameters
         launch!(arch, grid, parameters, compute_Gu!, 
-                tendencies.u, grid, only_active_cells, u_kernel_args;
-                only_active_cells)
+                tendencies.u, grid, active_cells_map, u_kernel_args;
+                active_cells_map)
 
         launch!(arch, grid, parameters, compute_Gv!, 
-                tendencies.v, grid, only_active_cells, v_kernel_args;
-                only_active_cells)
+                tendencies.v, grid, active_cells_map, v_kernel_args;
+                active_cells_map)
 
         launch!(arch, grid, parameters, compute_Gw!, 
-                tendencies.w, grid, only_active_cells, w_kernel_args;
-                only_active_cells)
+                tendencies.w, grid, active_cells_map, w_kernel_args;
+                active_cells_map)
     end
 
     start_tracer_kernel_args = (advection, closure)
@@ -120,8 +120,8 @@ function compute_interior_tendency_contributions!(model, kernel_parameters; only
 
         for parameters in kernel_parameters
             launch!(arch, grid, parameters, compute_Gc!, 
-                    c_tendency, grid, only_active_cells, args;
-                    only_active_cells)
+                    c_tendency, grid, active_cells_map, args;
+                    active_cells_map)
         end
     end
 
@@ -138,9 +138,9 @@ end
     @inbounds Gu[i, j, k] = u_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_Gu!(Gu, grid::ActiveCellsIBG, ::InteriorMap, args) 
+@kernel function compute_Gu!(Gu, grid::ActiveCellsIBG, map::InteriorMap, args) 
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
+    i, j, k = active_linear_index_to_tuple(idx, map, grid)
     @inbounds Gu[i, j, k] = u_velocity_tendency(i, j, k, grid, args...)
 end
 
@@ -150,9 +150,9 @@ end
     @inbounds Gv[i, j, k] = v_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_Gv!(Gv, grid::ActiveCellsIBG, ::InteriorMap, args) 
+@kernel function compute_Gv!(Gv, grid::ActiveCellsIBG, map::InteriorMap, args) 
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
+    i, j, k = active_linear_index_to_tuple(idx, map, grid)
     @inbounds Gv[i, j, k] = v_velocity_tendency(i, j, k, grid, args...)
 end
 
@@ -162,9 +162,9 @@ end
     @inbounds Gw[i, j, k] = w_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_Gw!(Gw, grid::ActiveCellsIBG, ::InteriorMap, args)
+@kernel function compute_Gw!(Gw, grid::ActiveCellsIBG, map, ::InteriorMap, args)
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
+    i, j, k = active_linear_index_to_tuple(idx, map, grid)
     @inbounds Gw[i, j, k] = w_velocity_tendency(i, j, k, grid, args...)
 end
 
@@ -178,9 +178,9 @@ end
     @inbounds Gc[i, j, k] = tracer_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_Gc!(Gc, grid::ActiveCellsIBG, ::InteriorMap, args) 
+@kernel function compute_Gc!(Gc, grid::ActiveCellsIBG, map::InteriorMap, args) 
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
+    i, j, k = active_linear_index_to_tuple(idx, map, grid)
     @inbounds Gc[i, j, k] = tracer_tendency(i, j, k, grid, args...)
 end
 
diff --git a/src/MultiRegion/multi_region_split_explicit_free_surface.jl b/src/MultiRegion/multi_region_split_explicit_free_surface.jl
index 7349324fec..d33f6bd1a6 100644
--- a/src/MultiRegion/multi_region_split_explicit_free_surface.jl
+++ b/src/MultiRegion/multi_region_split_explicit_free_surface.jl
@@ -11,14 +11,11 @@ function SplitExplicitAuxiliaryFields(grid::MultiRegionGrids)
 
     Hᶠᶜ = Field((Face,   Center, Nothing), grid)
     Hᶜᶠ = Field((Center, Face,   Nothing), grid)
-    Hᶜᶜ = Field((Center, Center, Nothing), grid)
 
     @apply_regionally calculate_column_height!(Hᶠᶜ, (Face, Center, Center))
     @apply_regionally calculate_column_height!(Hᶜᶠ, (Center, Face, Center))
 
-    @apply_regionally calculate_column_height!(Hᶜᶜ, (Center, Center, Center))
-
-    fill_halo_regions!((Hᶠᶜ, Hᶜᶠ, Hᶜᶜ))
+    fill_halo_regions!((Hᶠᶜ, Hᶜᶠ))
 
     # In a non-parallel grid we calculate only the interior
     @apply_regionally kernel_size    = augmented_kernel_size(grid, grid.partition)
@@ -26,7 +23,7 @@ function SplitExplicitAuxiliaryFields(grid::MultiRegionGrids)
     
     @apply_regionally kernel_parameters = KernelParameters(kernel_size, kernel_offsets)
 
-    return SplitExplicitAuxiliaryFields(Gᵁ, Gⱽ, Hᶠᶜ, Hᶜᶠ, Hᶜᶜ, kernel_parameters)
+    return SplitExplicitAuxiliaryFields(Gᵁ, Gⱽ, Hᶠᶜ, Hᶜᶠ, kernel_parameters)
 end
 
 @inline function calculate_column_height!(height, location)
@@ -55,7 +52,7 @@ function FreeSurface(free_surface::SplitExplicitFreeSurface, velocities, grid::M
     η = ZFaceField(new_grid, indices = (:, :, size(new_grid, 3)+1))
 
     return SplitExplicitFreeSurface(η,
-                                    SplitExplicitState(new_grid),
+                                    SplitExplicitState(new_grid, free_surface.settings.timestepper),
                                     SplitExplicitAuxiliaryFields(new_grid),
                                     free_surface.gravitational_acceleration,
                                     free_surface.settings)
diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl
index 8cd68a6631..ec5872a846 100644
--- a/src/Solvers/batched_tridiagonal_solver.jl
+++ b/src/Solvers/batched_tridiagonal_solver.jl
@@ -129,7 +129,10 @@ end
 @kernel function solve_batched_tridiagonal_system_kernel!(ϕ, a, b, c, f, t, grid, p, args, tridiagonal_direction::XDirection)
     Nx = size(grid, 1)
     j, k = @index(Global, NTuple)
+    solve_batched_tridiagonal_system_x!(j, k, Nx, ϕ, a, b, c, f, t, grid, p, args, tridiagonal_direction)
+end
 
+@inline function solve_batched_tridiagonal_system_x!(j, k, Nx, ϕ, a, b, c, f, t, grid, p, args, tridiagonal_direction)
     @inbounds begin
         β  = get_coefficient(1, j, k, grid, b, p, tridiagonal_direction, args...)
         f₁ = get_coefficient(1, j, k, grid, f, p, tridiagonal_direction, args...)
@@ -161,7 +164,10 @@ end
 @kernel function solve_batched_tridiagonal_system_kernel!(ϕ, a, b, c, f, t, grid, p, args, tridiagonal_direction::YDirection)
     Ny = size(grid, 2)
     i, k = @index(Global, NTuple)
+    solve_batched_tridiagonal_system_y!(i, k, Ny, ϕ, a, b, c, f, t, grid, p, args, tridiagonal_direction)
+end
 
+@inline function solve_batched_tridiagonal_system_y!(i, k, Ny, ϕ, a, b, c, f, t, grid, p, args, tridiagonal_direction)
     @inbounds begin
         β  = get_coefficient(i, 1, k, grid, b, p, tridiagonal_direction, args...)
         f₁ = get_coefficient(i, 1, k, grid, f, p, tridiagonal_direction, args...)
@@ -193,7 +199,10 @@ end
 @kernel function solve_batched_tridiagonal_system_kernel!(ϕ, a, b, c, f, t, grid, p, args, tridiagonal_direction::ZDirection)
     Nz = size(grid, 3)
     i, j = @index(Global, NTuple)
+    solve_batched_tridiagonal_system_z!(i, j, Nz, ϕ, a, b, c, f, t, grid, p, args, tridiagonal_direction)
+end
 
+@inline function solve_batched_tridiagonal_system_z!(i, j, Nz, ϕ, a, b, c, f, t, grid, p, args, tridiagonal_direction)
     @inbounds begin
         β  = get_coefficient(i, j, 1, grid, b, p, tridiagonal_direction, args...)
         f₁ = get_coefficient(i, j, 1, grid, f, p, tridiagonal_direction, args...)
diff --git a/src/TimeSteppers/quasi_adams_bashforth_2.jl b/src/TimeSteppers/quasi_adams_bashforth_2.jl
index 19f5f1beb3..780df779b6 100644
--- a/src/TimeSteppers/quasi_adams_bashforth_2.jl
+++ b/src/TimeSteppers/quasi_adams_bashforth_2.jl
@@ -1,6 +1,7 @@
 using Oceananigans.Fields: FunctionField, location
 using Oceananigans.TurbulenceClosures: implicit_step!
 using Oceananigans.Utils: @apply_regionally, apply_regionally!
+using Oceananigans.ImmersedBoundaries: ActiveCellsIBG, active_linear_index_to_tuple
 
 mutable struct QuasiAdamsBashforth2TimeStepper{FT, GT, IT} <: AbstractTimeStepper
                   χ :: FT
diff --git a/src/TimeSteppers/store_tendencies.jl b/src/TimeSteppers/store_tendencies.jl
index 06d179bd3a..e2f1b07c7e 100644
--- a/src/TimeSteppers/store_tendencies.jl
+++ b/src/TimeSteppers/store_tendencies.jl
@@ -1,10 +1,10 @@
 using Oceananigans: prognostic_fields
 using Oceananigans.Grids: AbstractGrid
-
+using Oceananigans.ImmersedBoundaries: ActiveCellsIBG
 using Oceananigans.Utils: launch!
 
 """ Store source terms for `u`, `v`, and `w`. """
-@kernel function store_field_tendencies!(G⁻, grid, G⁰)
+@kernel function store_field_tendencies!(G⁻, G⁰)
     i, j, k = @index(Global, NTuple)
     @inbounds G⁻[i, j, k] = G⁰[i, j, k]
 end
@@ -16,7 +16,6 @@ function store_tendencies!(model)
     for field_name in keys(model_fields)
         launch!(model.architecture, model.grid, :xyz, store_field_tendencies!,
                 model.timestepper.G⁻[field_name],
-                model.grid,
                 model.timestepper.Gⁿ[field_name])
     end
 
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index b65de06ff4..db17a26370 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -3,7 +3,6 @@ using Oceananigans.BuoyancyModels: ∂z_b
 using Oceananigans.Operators
 using Oceananigans.Grids: inactive_node
 using Oceananigans.Operators: ℑzᵃᵃᶜ
-using Oceananigans.Utils: use_only_active_interior_cells
 
 struct RiBasedVerticalDiffusivity{TD, FT, R} <: AbstractScalarDiffusivity{TD, VerticalFormulation, 1}
     ν₀  :: FT
diff --git a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
index 71fa9bd8e4..56933e77e9 100644
--- a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
+++ b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
@@ -178,7 +178,8 @@ function implicit_step!(field::Field,
                         diffusivity_fields,
                         tracer_index,
                         clock,
-                        Δt)
+                        Δt; 
+                        kwargs...)
     
    loc = location(field)
 
@@ -208,6 +209,6 @@ function implicit_step!(field::Field,
 
     return solve!(field, implicit_solver, field,
                   # ivd_*_diagonal gets called with these args after (i, j, k, grid):
-                  vi_closure, vi_diffusivity_fields, tracer_index, map(ℓ -> ℓ(), loc)..., clock, Δt, κz)
+                  vi_closure, vi_diffusivity_fields, tracer_index, map(ℓ -> ℓ(), loc)..., clock, Δt, κz; kwargs...)
 end
 
diff --git a/src/Utils/Utils.jl b/src/Utils/Utils.jl
index 6c6543692c..761c8d23a6 100644
--- a/src/Utils/Utils.jl
+++ b/src/Utils/Utils.jl
@@ -1,6 +1,6 @@
 module Utils
 
-export launch_config, work_layout, launch!, KernelParameters
+export configured_kernel, work_layout, launch!, KernelParameters
 export prettytime, pretty_filesize
 export tupleit, parenttuple, datatuple, datatuples
 export validate_intervals, time_to_run
diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 5a2589c5e1..b0d3338669 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -7,7 +7,6 @@ using Oceananigans.Utils
 using Oceananigans.Grids
 
 using Oceananigans.Grids: AbstractGrid
-
 import Base
 
 struct KernelParameters{S, O} end
@@ -106,8 +105,7 @@ function work_layout(grid, workdims::Symbol; include_right_boundaries=false, loc
     return workgroup, worksize
 end
 
-@inline active_cells_work_layout(workgroup, worksize, only_active_cells, grid) = workgroup, worksize
-@inline use_only_active_interior_cells(grid) = nothing
+@inline active_cells_work_layout(workgroup, worksize, active_cells_map, grid) = workgroup, worksize
 
 """
     launch!(arch, grid, layout, kernel!, args...; kwargs...)
@@ -119,9 +117,55 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
                  include_right_boundaries = false,
                  reduced_dimensions = (),
                  location = nothing,
-                 only_active_cells = nothing,
+                 active_cells_map = nothing,
                  kwargs...)
 
+    loop! = configured_kernel(arch, grid, workspec, kernel!;
+                              include_right_boundaries,
+                              reduced_dimensions,
+                              location,
+                              active_cells_map,
+                              kwargs...)
+    
+    
+    !isnothing(loop!) && loop!(kernel_args...)
+    
+    return nothing
+end
+
+
+"""
+    configured_kernel(arch, grid, workspec, kernel!; include_right_boundaries=false, reduced_dimensions=(), location=nothing, active_cells_map=nothing, kwargs...)
+
+Configures a kernel with the specified architecture, grid, workspec, and kernel function.
+If `active_cells_map` is provided, the work distribution is adjusted to include only the active cells.
+If the worksize is 0 after adjusting for active cells, the function returns `nothing`.
+
+# Arguments
+============
+
+- `arch`: The architecture on which the kernel will be launched.
+- `grid`: The grid on which the kernel will be executed.
+- `workspec`: The workspec that defines the work distribution.
+- `kernel!`: The kernel function to be executed.
+
+# Keyword Arguments
+====================
+
+- `include_right_boundaries`: A boolean indicating whether to include right boundaries `(N + 1)`. Default is `false`.
+- `reduced_dimensions`: A tuple specifying the dimensions to be reduced in the work distribution. Default is an empty tuple.
+- `location`: The location of the kernel execution, needed for `include_right_boundaries`. Default is `nothing`.
+- `active_cells_map`: A map indicating the active cells in the grid. If the map is not a nothing, the workspec will be disregarded and 
+                     the kernel is configured as a linear kernel with a worksize equal to the length of the active cell map. Default is `nothing`.
+"""
+
+function configured_kernel(arch, grid, workspec, kernel!;
+                           include_right_boundaries = false,
+                           reduced_dimensions = (),
+                           location = nothing,
+                           active_cells_map = nothing,
+                           kwargs...)
+
     workgroup, worksize = work_layout(grid, workspec;
                                       include_right_boundaries,
                                       reduced_dimensions,
@@ -129,26 +173,23 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
 
     offset = offsets(workspec)
 
-    if !isnothing(only_active_cells) 
-        workgroup, worksize = active_cells_work_layout(workgroup, worksize, only_active_cells, grid) 
+    if !isnothing(active_cells_map) 
+        workgroup, worksize = active_cells_work_layout(workgroup, worksize, active_cells_map, grid) 
         offset = nothing
-    end
 
-    if worksize == 0
-        return nothing
+        # A non active domain! 
+        if worksize == 0
+            return nothing
+        end
     end
-    
+
     # We can only launch offset kernels with Static sizes!!!!
     loop! = isnothing(offset) ? kernel!(Architectures.device(arch), workgroup, worksize) : 
                                 kernel!(Architectures.device(arch), StaticSize(workgroup), OffsetStaticSize(contiguousrange(worksize, offset))) 
 
-    @debug "Launching kernel $kernel! with worksize $worksize and offsets $offset from $workspec"
-
-    loop!(kernel_args...)
-
-    return nothing
+    return loop!
 end
-
+        
 # When dims::Val
 @inline launch!(arch, grid, ::Val{workspec}, args...; kwargs...) where workspec =
     launch!(arch, grid, workspec, args...; kwargs...)
diff --git a/test/test_split_explicit_free_surface_solver.jl b/test/test_split_explicit_free_surface_solver.jl
index ccb8aeb005..b4e7a53b50 100644
--- a/test/test_split_explicit_free_surface_solver.jl
+++ b/test/test_split_explicit_free_surface_solver.jl
@@ -1,7 +1,7 @@
 include("dependencies_for_runtests.jl")
 using Oceananigans.Models.HydrostaticFreeSurfaceModels
 import Oceananigans.Models.HydrostaticFreeSurfaceModels: SplitExplicitFreeSurface
-import Oceananigans.Models.HydrostaticFreeSurfaceModels: SplitExplicitState, SplitExplicitAuxiliaryFields, SplitExplicitSettings, split_explicit_free_surface_substep!
+import Oceananigans.Models.HydrostaticFreeSurfaceModels: SplitExplicitState, SplitExplicitAuxiliaryFields, SplitExplicitSettings, iterate_split_explicit!
 
 using Oceananigans.Models.HydrostaticFreeSurfaceModels: constant_averaging_kernel
 using Oceananigans.Models.HydrostaticFreeSurfaceModels: calculate_substeps, calculate_adaptive_settings
@@ -12,51 +12,35 @@ using Oceananigans.Models.HydrostaticFreeSurfaceModels: calculate_substeps, calc
         for arch in archs
             topology = (Periodic, Periodic, Bounded)
 
-            Nx, Ny, Nz = 128, 64, 16
-            Lx = Ly = Lz = 2π
+            Nx, Ny, Nz = 128, 64, 1
+            Lx = Ly = 2π
+            Lz = 1 / Oceananigans.BuoyancyModels.g_Earth
 
             grid = RectilinearGrid(arch, FT;
                                    topology, size = (Nx, Ny, Nz),
                                    x = (0, Lx), y = (0, Ly), z = (-Lz, 0),
                                    halo=(1, 1, 1))
 
-            settings = SplitExplicitSettings(; substeps = 200, averaging_kernel = constant_averaging_kernel)
+            settings = SplitExplicitSettings(eltype(grid); substeps = 200, averaging_kernel = constant_averaging_kernel)
             sefs = SplitExplicitFreeSurface(grid; settings)
 
             sefs.η .= 0
 
             @testset " One timestep test " begin
                 state = sefs.state
-                auxiliary = sefs.auxiliary
                 U, V, η̅, U̅, V̅ = state.U, state.V, state.η̅, state.U̅, state.V̅
-                Gᵁ, Gⱽ = auxiliary.Gᵁ, auxiliary.Gⱽ
-                Hᶠᶜ, Hᶜᶠ = sefs.auxiliary.Hᶠᶜ, sefs.auxiliary.Hᶜᶠ
-                g = sefs.gravitational_acceleration
 
-                Hᶠᶜ .= 1 / g
-                Hᶜᶠ .= 1 / g
                 η = sefs.η
-                velocity_weight = 0.0
-                free_surface_weight = 0.0
                 Δτ = 1.0
 
                 η₀(x, y, z) = sin(x)
                 set!(η, η₀)
-                U₀(x, y) = 0
-                set!(U, U₀)
-                V₀(x, y) = 0
-                set!(V, V₀)
-
-                η̅ .= 0
-                U̅ .= 0
-                V̅ .= 0
-                Gᵁ .= 0
-                Gⱽ .= 0
-
-                Nsubsteps  = calculate_substeps(settings.substepping, 1)
+            
+                Nsubsteps = calculate_substeps(settings.substepping, 1)
                 fractional_Δt, weights = calculate_adaptive_settings(settings.substepping, Nsubsteps) # barotropic time step in fraction of baroclinic step and averaging weights
 
-                split_explicit_free_surface_substep!(η, sefs.state, sefs.auxiliary, sefs.settings, weights, arch, grid, g, Δτ, 1)
+                iterate_split_explicit!(sefs, grid, Δτ, weights, Val(1)) 
+
                 U_computed = Array(U.data.parent)[2:Nx+1, 2:Ny+1]
                 U_exact = (reshape(-cos.(grid.xᶠᵃᵃ), (length(grid.xᶜᵃᵃ), 1)).+reshape(0 * grid.yᵃᶜᵃ, (1, length(grid.yᵃᶜᵃ))))[2:Nx+1, 2:Ny+1]
 
@@ -68,50 +52,43 @@ using Oceananigans.Models.HydrostaticFreeSurfaceModels: calculate_substeps, calc
                 auxiliary = sefs.auxiliary
                 U, V, η̅, U̅, V̅ = state.U, state.V, state.η̅, state.U̅, state.V̅
                 Gᵁ, Gⱽ = auxiliary.Gᵁ, auxiliary.Gⱽ
-                g = sefs.gravitational_acceleration
-                sefs.auxiliary.Hᶠᶜ .= 1 / g
-                sefs.auxiliary.Hᶜᶠ .= 1 / g
                 η = sefs.η
-                velocity_weight = 0.0
-                free_surface_weight = 0.0
 
-                T = 2π
+                T  = 2π
                 Δτ = 2π / maximum([Nx, Ny]) * 5e-2 # the last factor is essentially the order of accuracy
                 Nt = floor(Int, T / Δτ)
                 Δτ_end = T - Nt * Δτ
 
                 settings = SplitExplicitSettings(; substeps = Nt, averaging_kernel = constant_averaging_kernel)
+                sefs = sefs(settings)
 
                 # set!(η, f(x,y))
                 η₀(x, y, z) = sin(x)
                 set!(η, η₀)
-                U₀(x, y) = 0
+                U₀(x, y, z) = 0
                 set!(U, U₀)
-                V₀(x, y) = 0
+                V₀(x, y, z) = 0
                 set!(V, V₀)
 
-                η̅ .= 0
-                U̅ .= 0
-                V̅ .= 0
+                η̅  .= 0
+                U̅  .= 0
+                V̅  .= 0
                 Gᵁ .= 0
                 Gⱽ .= 0
 
-                Nsubsteps  = calculate_substeps(settings.substepping, 1)
-                fractional_Δt, weights = calculate_adaptive_settings(settings.substepping, Nsubsteps) # barotropic time step in fraction of baroclinic step and averaging weights
-    
+                weights = settings.substepping.averaging_weights
+                
                 for i in 1:Nt
-                    split_explicit_free_surface_substep!(η, sefs.state, sefs.auxiliary, settings, weights, arch, grid, g, Δτ, i)
+                    iterate_split_explicit!(sefs, grid, Δτ, weights, Val(1)) 
                 end
-
-                # + correction for exact time
-                split_explicit_free_surface_substep!(η, sefs.state, sefs.auxiliary, settings, weights, arch, grid, g, Δτ_end, 1)
-
-                U_computed = Array(parent(U))[2:Nx+1, 2:Ny+1]
-                η_computed = Array(parent(η))[2:Nx+1, 2:Ny+1]
+                iterate_split_explicit!(sefs, grid, Δτ_end, weights, Val(1)) 
+    
+                U_computed = Array(deepcopy(interior(U)))
+                η_computed = Array(deepcopy(interior(η)))
                 set!(η, η₀)
                 set!(U, U₀)
-                U_exact = Array(parent(U))[2:Nx+1, 2:Ny+1]
-                η_exact = Array(parent(η))[2:Nx+1, 2:Ny+1]
+                U_exact = Array(deepcopy(interior(U)))
+                η_exact = Array(deepcopy(interior(η)))
 
                 @test maximum(abs.(U_computed - U_exact)) < 1e-3
                 @show maximum(abs.(η_computed))
@@ -119,6 +96,11 @@ using Oceananigans.Models.HydrostaticFreeSurfaceModels: calculate_substeps, calc
                 @test maximum(abs.(η_computed - η_exact)) < max(100eps(FT), 1e-6)
             end
 
+            settings = SplitExplicitSettings(eltype(grid); substeps = 200, averaging_kernel = constant_averaging_kernel)
+            sefs = SplitExplicitFreeSurface(grid; settings)
+
+            sefs.η .= 0
+
             @testset "Averaging / Do Nothing test " begin
                 state = sefs.state
                 auxiliary = sefs.auxiliary
@@ -126,46 +108,40 @@ using Oceananigans.Models.HydrostaticFreeSurfaceModels: calculate_substeps, calc
                 Gᵁ, Gⱽ = auxiliary.Gᵁ, auxiliary.Gⱽ
 
                 g = sefs.gravitational_acceleration
-                sefs.auxiliary.Hᶠᶜ .= 1 / g
-                sefs.auxiliary.Hᶜᶠ .= 1 / g
                 η = sefs.η
-                velocity_weight = 0.0
-                free_surface_weight = 0.0
 
-                Δτ = 2π / maximum([Nx, Ny]) * 5e-2 # the last factor is essentially the order of accuracy
+                Δτ = 2π / maximum([Nx, Ny]) * 1e-2 # the last factor is essentially the order of accuracy
 
                 # set!(η, f(x, y))
                 η_avg = 1
                 U_avg = 2
                 V_avg = 3
-                η₀(x, y, z) = η_avg
-                set!(η, η₀)
-                U₀(x, y) = U_avg
-                set!(U, U₀)
-                V₀(x, y) = V_avg
-                set!(V, V₀)
+                fill!(η, η_avg)
+                fill!(U, U_avg)
+                fill!(V, V_avg)
+
+                fill!(η̅ , 0)
+                fill!(U̅ , 0)
+                fill!(V̅ , 0)
+                fill!(Gᵁ, 0)
+                fill!(Gⱽ, 0)
 
-                η̅ .= 0
-                U̅ .= 0
-                V̅ .= 0
-                Gᵁ .= 0
-                Gⱽ .= 0
                 settings = sefs.settings
 
                 Nsubsteps  = calculate_substeps(settings.substepping, 1)
                 fractional_Δt, weights = calculate_adaptive_settings(settings.substepping, Nsubsteps) # barotropic time step in fraction of baroclinic step and averaging weights
-    
-                for i in 1:Nsubsteps
-                    split_explicit_free_surface_substep!(η, sefs.state, sefs.auxiliary, sefs.settings, weights, arch, grid, g, Δτ, i)
+                
+                for step in 1:Nsubsteps
+                    iterate_split_explicit!(sefs, grid, Δτ, weights, Val(1)) 
                 end
 
-                U_computed = Array(U.data.parent)[2:Nx+1, 2:Ny+1]
-                V_computed = Array(V.data.parent)[2:Nx+1, 2:Ny+1]
-                η_computed = Array(η.data.parent)[2:Nx+1, 2:Ny+1]
+                U_computed = Array(deepcopy(interior(U)))
+                V_computed = Array(deepcopy(interior(V)))
+                η_computed = Array(deepcopy(interior(η)))
 
-                U̅_computed = Array(U̅.data.parent)[2:Nx+1, 2:Ny+1]
-                V̅_computed = Array(V̅.data.parent)[2:Nx+1, 2:Ny+1]
-                η̅_computed = Array(η̅.data.parent)[2:Nx+1, 2:Ny+1]
+                U̅_computed = Array(deepcopy(interior(U̅)))
+                V̅_computed = Array(deepcopy(interior(V̅)))
+                η̅_computed = Array(deepcopy(interior(η̅)))
 
                 tolerance = 100eps(FT)
 
@@ -197,8 +173,6 @@ using Oceananigans.Models.HydrostaticFreeSurfaceModels: calculate_substeps, calc
                 Gᵁ, Gⱽ = auxiliary.Gᵁ, auxiliary.Gⱽ
                 η = sefs.η
                 g = sefs.gravitational_acceleration
-                sefs.auxiliary.Hᶠᶜ .= 1 / g # to make life easy
-                sefs.auxiliary.Hᶜᶠ .= 1 / g # to make life easy
 
                 # set!(η, f(x,y)) k² = ω²
                 gu_c = 1
@@ -216,47 +190,44 @@ using Oceananigans.Models.HydrostaticFreeSurfaceModels: calculate_substeps, calc
                 Gᵁ .= gu_c
                 Gⱽ .= gv_c
 
-                settings = SplitExplicitSettings(substeps = Nt + 1, averaging_kernel = constant_averaging_kernel)
+                settings = SplitExplicitSettings(eltype(grid); substeps = Nt + 1, averaging_kernel = constant_averaging_kernel)
                 sefs = sefs(settings)
 
-                Nsubsteps  = calculate_substeps(settings.substepping, 1)
-                fractional_Δt, weights = calculate_adaptive_settings(settings.substepping, Nsubsteps) # barotropic time step in fraction of baroclinic step and averaging weights
-    
+                weights = settings.substepping.averaging_weights
                 for i in 1:Nt
-                    split_explicit_free_surface_substep!(η, sefs.state, sefs.auxiliary, sefs.settings, weights, arch, grid, g, Δτ, i)
+                    iterate_split_explicit!(sefs, grid, Δτ, weights, Val(1)) 
                 end
-                # + correction for exact time
-                split_explicit_free_surface_substep!(η, sefs.state, sefs.auxiliary, sefs.settings, weights, arch, grid, g, Δτ_end, Nt + 1)
+                iterate_split_explicit!(sefs, grid, Δτ_end, weights, Val(1)) 
 
                 η_mean_after = mean(Array(interior(η)))
 
                 tolerance = 10eps(FT)
                 @test abs(η_mean_after - η_mean_before) < tolerance
 
-                η_computed = Array(η.data.parent)[2:Nx+1, 2:Ny+1]
-                U_computed = Array(U.data.parent)[2:Nx+1, 2:Ny+1]
-                V_computed = Array(V.data.parent)[2:Nx+1, 2:Ny+1]
+                η_computed = Array(deepcopy(interior(η, :, 1, 1)))
+                U_computed = Array(deepcopy(interior(U, :, 1, 1)))
+                V_computed = Array(deepcopy(interior(V, :, 1, 1)))
 
-                η̅_computed = Array(η̅.data.parent)[2:Nx+1, 2:Ny+1]
-                U̅_computed = Array(U̅.data.parent)[2:Nx+1, 2:Ny+1]
-                V̅_computed = Array(V̅.data.parent)[2:Nx+1, 2:Ny+1]
+                η̅_computed = Array(deepcopy(interior(η̅, :, 1, 1)))
+                U̅_computed = Array(deepcopy(interior(U̅, :, 1, 1)))
+                V̅_computed = Array(deepcopy(interior(V̅, :, 1, 1)))
 
                 set!(η, η₀)
 
                 # ∂ₜₜ(η) = Δη
-                η_exact = cos(ω * T) * (Array(η.data.parent)[2:Nx+1, 2:Ny+1] .- 1) .+ 1
+                η_exact = cos(ω * T) * (Array(interior(η, :, 1, 1)) .- 1) .+ 1
 
-                U₀(x, y) = kx * cos(kx * x) * sin(ky * y) # ∂ₜU = - ∂x(η), since we know η
+                U₀(x, y, z) = kx * cos(kx * x) * sin(ky * y) # ∂ₜU = - ∂x(η), since we know η
                 set!(U, U₀)
-                U_exact = -(sin(ω * T) * 1 / ω) .* Array(U.data.parent)[2:Nx+1, 2:Ny+1] .+ gu_c * T
+                U_exact = -(sin(ω * T) * 1 / ω) .* Array(interior(U, :, 1, 1)) .+ gu_c * T
 
-                V₀(x, y) = ky * sin(kx * x) * cos(ky * y) # ∂ₜV = - ∂y(η), since we know η
+                V₀(x, y, z) = ky * sin(kx * x) * cos(ky * y) # ∂ₜV = - ∂y(η), since we know η
                 set!(V, V₀)
-                V_exact = -(sin(ω * T) * 1 / ω) .* Array(V.data.parent)[2:Nx+1, 2:Ny+1] .+ gv_c * T
+                V_exact = -(sin(ω * T) * 1 / ω) .* Array(interior(V, :, 1, 1)) .+ gv_c * T
 
-                η̅_exact = (sin(ω * T) / ω - sin(ω * 0) / ω) / T * (Array(η.data.parent)[2:Nx+1, 2:Ny+1] .- 1) .+ 1
-                U̅_exact = (cos(ω * T) * 1 / ω^2 - cos(ω * 0) * 1 / ω^2) / T * Array(U.data.parent)[2:Nx+1, 2:Ny+1] .+ gu_c * T / 2
-                V̅_exact = (cos(ω * T) * 1 / ω^2 - cos(ω * 0) * 1 / ω^2) / T * Array(V.data.parent)[2:Nx+1, 2:Ny+1] .+ gv_c * T / 2
+                η̅_exact = (sin(ω * T) / ω - sin(ω * 0) / ω) / T * (Array(interior(η, :, 1, 1)) .- 1) .+ 1
+                U̅_exact = (cos(ω * T) * 1 / ω^2 - cos(ω * 0) * 1 / ω^2) / T * Array(interior(U, :, 1, 1)) .+ gu_c * T / 2
+                V̅_exact = (cos(ω * T) * 1 / ω^2 - cos(ω * 0) * 1 / ω^2) / T * Array(interior(V, :, 1, 1)) .+ gv_c * T / 2
 
                 tolerance = 1e-2
 
diff --git a/test/test_split_explicit_vertical_integrals.jl b/test/test_split_explicit_vertical_integrals.jl
index aad8fbd8d6..6016053926 100644
--- a/test/test_split_explicit_vertical_integrals.jl
+++ b/test/test_split_explicit_vertical_integrals.jl
@@ -1,8 +1,8 @@
 include("dependencies_for_runtests.jl")
 using Oceananigans.Models.HydrostaticFreeSurfaceModels
-import Oceananigans.Models.HydrostaticFreeSurfaceModels: SplitExplicitFreeSurface
-import Oceananigans.Models.HydrostaticFreeSurfaceModels: SplitExplicitState, SplitExplicitAuxiliaryFields, SplitExplicitSettings
-import Oceananigans.Models.HydrostaticFreeSurfaceModels: compute_barotropic_mode!, barotropic_split_explicit_corrector!, initialize_free_surface_state!
+using Oceananigans.Models.HydrostaticFreeSurfaceModels: SplitExplicitFreeSurface
+using Oceananigans.Models.HydrostaticFreeSurfaceModels: SplitExplicitState, SplitExplicitAuxiliaryFields, SplitExplicitSettings
+using Oceananigans.Models.HydrostaticFreeSurfaceModels: compute_barotropic_mode!, barotropic_split_explicit_corrector!, initialize_free_surface_state!
 
 @testset "Barotropic Kernels" begin
 
@@ -17,7 +17,7 @@ import Oceananigans.Models.HydrostaticFreeSurfaceModels: compute_barotropic_mode
         grid = RectilinearGrid(arch, topology = topology, size = (Nx, Ny, Nz), x = (0, Lx), y = (0, Ly), z = (-Lz, 0))
 
         tmp = SplitExplicitFreeSurface(; substeps = 200)
-        sefs = SplitExplicitState(grid)
+        sefs = SplitExplicitState(grid, tmp.settings.timestepper)
         sefs = SplitExplicitAuxiliaryFields(grid)
         sefs = SplitExplicitFreeSurface(grid)
 
@@ -25,7 +25,6 @@ import Oceananigans.Models.HydrostaticFreeSurfaceModels: compute_barotropic_mode
         auxiliary = sefs.auxiliary
         U, V, η̅, U̅, V̅ = state.U, state.V, state.η̅, state.U̅, state.V̅
         Gᵁ, Gⱽ = auxiliary.Gᵁ, auxiliary.Gⱽ
-        Hᶠᶜ, Hᶜᶠ = sefs.auxiliary.Hᶠᶜ, sefs.auxiliary.Hᶜᶠ
 
         u = Field{Face,Center,Center}(grid)
         v = Field{Center,Face,Center}(grid)
@@ -34,7 +33,7 @@ import Oceananigans.Models.HydrostaticFreeSurfaceModels: compute_barotropic_mode
             # set equal to something else
             η̅ .= U̅ .= V̅ .= 1.0
             # now set equal to zero
-            initialize_free_surface_state!(sefs.state, sefs.η)
+            initialize_free_surface_state!(sefs.state, sefs.η, sefs.settings.timestepper)
             # don't forget the ghost points
             fill_halo_regions!(η̅)
             fill_halo_regions!(U̅)
@@ -51,7 +50,7 @@ import Oceananigans.Models.HydrostaticFreeSurfaceModels: compute_barotropic_mode
             Δz .= grid.Δzᵃᵃᶠ
 
             set_u_check(x, y, z) = cos((π / 2) * z / Lz)
-            set_U_check(x, y) = (sin(0) - (-2 * Lz / (π)))
+            set_U_check(x, y, z) = (sin(0) - (-2 * Lz / (π)))
             set!(u, set_u_check)
             exact_U = similar(U)
             set!(exact_U, set_U_check)
@@ -60,7 +59,7 @@ import Oceananigans.Models.HydrostaticFreeSurfaceModels: compute_barotropic_mode
             @test all((Array(interior(U) .- interior(exact_U))) .< tolerance)
 
             set_v_check(x, y, z) = sin(x * y) * cos((π / 2) * z / Lz)
-            set_V_check(x, y) = sin(x * y) * (sin(0) - (-2 * Lz / (π)))
+            set_V_check(x, y, z) = sin(x * y) * (sin(0) - (-2 * Lz / (π)))
             set!(v, set_v_check)
             exact_V = similar(V)
             set!(exact_V, set_V_check)
@@ -83,7 +82,7 @@ import Oceananigans.Models.HydrostaticFreeSurfaceModels: compute_barotropic_mode
             @test all(Array(interior(U)) .≈ Lz)
 
             set_u_check(x, y, z) = sin(x)
-            set_U_check(x, y) = sin(x) * Lz
+            set_U_check(x, y, z) = sin(x) * Lz
             set!(u, set_u_check)
             exact_U = similar(U)
             set!(exact_U, set_U_check)
@@ -91,7 +90,7 @@ import Oceananigans.Models.HydrostaticFreeSurfaceModels: compute_barotropic_mode
             @test all(Array(interior(U)) .≈ Array(interior(exact_U)))
 
             set_v_check(x, y, z) = sin(x) * z * cos(y)
-            set_V_check(x, y) = -sin(x) * Lz^2 / 2.0 * cos(y)
+            set_V_check(x, y, z) = -sin(x) * Lz^2 / 2.0 * cos(y)
             set!(v, set_v_check)
             exact_V = similar(V)
             set!(exact_V, set_V_check)
@@ -116,7 +115,6 @@ import Oceananigans.Models.HydrostaticFreeSurfaceModels: compute_barotropic_mode
             auxiliary = sefs.auxiliary
             U, V, η̅, U̅, V̅ = state.U, state.V, state.η̅, state.U̅, state.V̅
             Gᵁ, Gⱽ = auxiliary.Gᵁ, auxiliary.Gⱽ
-            Hᶠᶜ, Hᶜᶠ = sefs.auxiliary.Hᶠᶜ, sefs.auxiliary.Hᶜᶠ
 
             u = Field{Face,Center,Center}(grid)
             v = Field{Center,Face,Center}(grid)
@@ -124,22 +122,19 @@ import Oceananigans.Models.HydrostaticFreeSurfaceModels: compute_barotropic_mode
             v_corrected = similar(v)
 
             set_u(x, y, z) = z + Lz / 2 + sin(x)
-            set_U̅(x, y) = cos(x) * Lz
+            set_U̅(x, y, z) = cos(x) * Lz
             set_u_corrected(x, y, z) = z + Lz / 2 + cos(x)
             set!(u, set_u)
             set!(U̅, set_U̅)
             set!(u_corrected, set_u_corrected)
 
             set_v(x, y, z) = (z + Lz / 2) * sin(y) + sin(x)
-            set_V̅(x, y) = (cos(x) + x) * Lz
+            set_V̅(x, y, z) = (cos(x) + x) * Lz
             set_v_corrected(x, y, z) = (z + Lz / 2) * sin(y) + cos(x) + x
             set!(v, set_v)
             set!(V̅, set_V̅)
             set!(v_corrected, set_v_corrected)
 
-            sefs.auxiliary.Hᶠᶜ .= Lz
-            sefs.auxiliary.Hᶜᶠ .= Lz
-
             Δz = zeros(Nz)
             Δz .= grid.Δzᵃᵃᶜ
 
diff --git a/validation/distributed_simulations/distributed_hydrostatic_turbulence.jl b/validation/distributed_simulations/distributed_hydrostatic_turbulence.jl
index 5fbe1c766a..1358bb59cf 100644
--- a/validation/distributed_simulations/distributed_hydrostatic_turbulence.jl
+++ b/validation/distributed_simulations/distributed_hydrostatic_turbulence.jl
@@ -6,6 +6,8 @@ using Statistics
 using Oceananigans.BoundaryConditions
 using Oceananigans.DistributedComputations    
 using Random
+using JLD2
+using Oceananigans.ImmersedBoundaries: ActiveCellsIBG, active_interior_map
 
 # Run with 
 #
@@ -13,10 +15,11 @@ using Random
 #   mpiexec -n 4 julia --project distributed_hydrostatic_turbulence.jl
 # ```
 
-function run_simulation(nx, ny, arch, topo)
-    grid = RectilinearGrid(arch; topology=topo, size=(nx, ny, 1), extent=(4π, 4π, 0.5), halo=(7, 7, 7))
+function run_simulation(nx, ny, arch; topology = (Periodic, Periodic, Bounded))
+    grid = RectilinearGrid(arch; topology, size = (Nx, Ny, 10), extent=(4π, 4π, 0.5), halo=(8, 8, 8))
+    
     bottom(x, y) = (x > π && x < 3π/2 && y > π/2 && y < 3π/2) ? 1.0 : - grid.Lz - 1.0
-    grid = ImmersedBoundaryGrid(grid, GridFittedBottom(bottom))
+    grid = ImmersedBoundaryGrid(grid, GridFittedBottom(bottom); active_cells_map = true)
 
     model = HydrostaticFreeSurfaceModel(; grid,
                                         momentum_advection = VectorInvariant(vorticity_scheme=WENO(order=9)),
@@ -38,9 +41,9 @@ function run_simulation(nx, ny, arch, topo)
     set!(c, mask)
 
     u, v, _ = model.velocities
-    ζ = VerticalVorticityField(model)
+    # ζ = VerticalVorticityField(model)
     η = model.free_surface.η
-    outputs = merge(model.velocities, model.tracers, (; ζ, η))
+    outputs = merge(model.velocities, model.tracers)
 
     progress(sim) = @info "Iteration: $(sim.model.clock.iteration), time: $(sim.model.clock.time), Δt: $(sim.Δt)"
     simulation = Simulation(model, Δt=0.02, stop_time=100.0)
@@ -60,17 +63,13 @@ function run_simulation(nx, ny, arch, topo)
     MPI.Barrier(arch.communicator)
 end
 
-topo = (Periodic, Periodic, Bounded)
+Nx = 32
+Ny = 32
 
-# Use non-uniform partitioning in x, y.
-# TODO: Explain what local_index is.
-nx = [90, 128-90][arch.local_index[1]]
-ny = [56, 128-56][arch.local_index[2]]
-@show arch.local_index
-arch = Distributed(CPU(), topology = topo, ranks=(2, 2, 1)) 
+arch = Distributed(CPU(), partition = Partition(2, 2)) 
 
 # Run the simulation
-run_simulation(nx, ny, arch, topo)
+run_simulation(Nx, Ny, arch)
 
 # Visualize the plane
 # Produce a video for variable `var`
@@ -109,7 +108,6 @@ try
     if MPI.Comm_rank(MPI.COMM_WORLD) == 0
         visualize_simulation("u")
         visualize_simulation("v")
-        visualize_simulation("ζ")
         visualize_simulation("c")
     end
 catch err