Scaling improvement for the distributed HydrostaticFreeSurfaceModel (…

…#3404) * full interior map * bugfix * bugfix * bugfixes * hmmm * disambiguate * some organizing * hmmm * improve speed * now we get going * check it out * check bathymetry * fixit * rmove distributed * test it like this * I hope it works! * bugfix * bugfix * bugfix * bugfix * couple of bugfixes * bugfix * bugfixes * changes * try like this * some tests... * show the coordinate * bugfix * bugfix * test this hypothesis * another test * bugfix * other bugfix * now we'll see... * now it will work hopefully * all bugs fixed? * bugfix * remove the shows * unroll the loop * fully unrolled * split explicit loop unrolling * update * annotations * using NVTX * add NVTX * bugfix * bugfix * utils * try like this * text like this * remove reduced fields * small test * small change * nvtx on fill halos * all NVTX * fill it all * check it out * bugfixxed * bugfixed * bugfix * annotate the convert * bugfix * bugfix * add cudaconvert * remove NVTX * model grid * try like this? * bugfix * fix * should work? * add here * add here * bugfix * back to how it was * try it like this maybe? * convert * fixxing * try it now? * bugfix * add distributed * bugfix * allow unrolling * convert in archs * bugfix * just for testing * removed useless particles * removed bacthed stuff * tracer advetion type * bugfix * bugfix * other bugfix * other small bugfix * first bugfix * correct error * some bugfixes * bugfix * slightly more optim * simplifying more * all tests should be ok * try it * correct for last time * try again * fixed * tests fixxed * finally tests fixed * back to previous dt * bugfix * tests fixed? * ale * Update src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl Co-authored-by: Navid C. Constantinou <[email protected]> * Update src/TimeSteppers/quasi_adams_bashforth_2.jl Co-authored-by: Navid C. 
Constantinou <[email protected]> * removed NVTX * remove one line * if inside * better comment * some docstrings * remove NVTX * test an hypothesis * test it now * change name to ZColumnMap() * changed some function names * add comment * bugfix in warning * bugfix * some problems with distributed * apparently no more splatting * and also this splatting * removes kernel splatting * Update src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl Co-authored-by: Gregory L. Wagner <[email protected]> * Update tracer_advection_operators.jl * Some minor clarifications * remove grid from ab2step * bugfix * changing some names --------- Co-authored-by: Navid C. Constantinou <[email protected]> Co-authored-by: Gregory L. Wagner <[email protected]>
CliMA · Feb 27, 2024 · 0757379 · 0757379
1 parent 17ab145
commit 0757379
Show file tree

Hide file tree

Showing 33 changed files with 754 additions and 459 deletions.
diff --git a/ext/OceananigansEnzymeExt.jl b/ext/OceananigansEnzymeExt.jl
@@ -228,7 +228,7 @@ function EnzymeCore.EnzymeRules.augmented_primal(config,
                                                  include_right_boundaries = false,
                                                  reduced_dimensions = (),
                                                  location = nothing,
-                                                 only_active_cells = nothing,
+                                                 active_cells_map = nothing,
                                                  kwargs...) where N
 
 
@@ -239,8 +239,8 @@ function EnzymeCore.EnzymeRules.augmented_primal(config,
 
     offset = Oceananigans.Utils.offsets(workspec.val)
 
-    if !isnothing(only_active_cells) 
-        workgroup, worksize = Oceananigans.Utils.active_cells_work_layout(workgroup, worksize, only_active_cells, grid.val) 
+    if !isnothing(active_cells_map) 
+        workgroup, worksize = Oceananigans.Utils.active_cells_work_layout(workgroup, worksize, active_cells_map, grid.val) 
         offset = nothing
     end
 
@@ -286,7 +286,7 @@ function EnzymeCore.EnzymeRules.reverse(config::EnzymeCore.EnzymeRules.ConfigWid
                                                  include_right_boundaries = false,
                                                  reduced_dimensions = (),
                                                  location = nothing,
-                                                 only_active_cells = nothing,
+                                                 active_cells_map = nothing,
                                                  kwargs...) where N
 
   subrets = if tape !== nothing

diff --git a/src/Advection/tracer_advection_operators.jl b/src/Advection/tracer_advection_operators.jl
@@ -1,6 +1,31 @@
 using Oceananigans.Operators: Vᶜᶜᶜ
 using Oceananigans.Fields: ZeroField
 
+struct TracerAdvection{N, FT, A, B, C} <: AbstractAdvectionScheme{N, FT}
+    x :: A # scheme used for the x-direction tracer flux
+    y :: B # scheme used for the y-direction tracer flux
+    z :: C # scheme used for the z-direction tracer flux
+
+    TracerAdvection{N, FT}(x::A, y::B, z::C) where {N, FT, A, B, C} = new{N, FT, A, B, C}(x, y, z) # A, B, C are inferred from the arguments
+end
+
+"""
+    TracerAdvection(x_advection, y_advection, z_advection)
+
+Build a `TracerAdvection` type with reconstruction schemes `x_advection`, `y_advection`, and
+`z_advection` to be applied in the x-, y-, and z-directions, respectively.
+"""
+function TracerAdvection(x_advection, y_advection, z_advection)
+    Hx = required_halo_size(x_advection)
+    Hy = required_halo_size(y_advection)
+    Hz = required_halo_size(z_advection)
+
+    FT = eltype(x_advection) # the float type is taken from the x scheme only
+    H = max(Hx, Hy, Hz) # a single halo parameter: the largest of the three directions
+
+    return TracerAdvection{H, FT}(x_advection, y_advection, z_advection)
+end
+
 @inline _advective_tracer_flux_x(args...) = advective_tracer_flux_x(args...)
 @inline _advective_tracer_flux_y(args...) = advective_tracer_flux_y(args...)
 @inline _advective_tracer_flux_z(args...) = advective_tracer_flux_z(args...)
@@ -32,3 +57,9 @@ which ends up at the location `ccc`.
                                     δyᵃᶜᵃ(i, j, k, grid, _advective_tracer_flux_y, advection, U.v, c) +
                                     δzᵃᵃᶜ(i, j, k, grid, _advective_tracer_flux_z, advection, U.w, c))
 end
+
+@inline function div_Uc(i, j, k, grid, advection::TracerAdvection, U, c)
+    return 1/Vᶜᶜᶜ(i, j, k, grid) * (δxᶜᵃᵃ(i, j, k, grid, _advective_tracer_flux_x, advection.x, U.u, c) + # x-flux: `advection.x`
+                                    δyᵃᶜᵃ(i, j, k, grid, _advective_tracer_flux_y, advection.y, U.v, c) + # y-flux: `advection.y`
+                                    δzᵃᵃᶜ(i, j, k, grid, _advective_tracer_flux_z, advection.z, U.w, c))  # z-flux: `advection.z`
+end
diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
@@ -111,9 +111,9 @@ Vector Invariant, Dimension-by-dimension reconstruction
 function VectorInvariant(; vorticity_scheme = EnstrophyConserving(),
                            vorticity_stencil = VelocityStencil(),
                            vertical_scheme = EnergyConserving(),
-                           kinetic_energy_gradient_scheme = vertical_scheme,
                            divergence_scheme = vertical_scheme,
-                           upwinding  = OnlySelfUpwinding(; cross_scheme = vertical_scheme),
+                           kinetic_energy_gradient_scheme = divergence_scheme,
+                           upwinding  = OnlySelfUpwinding(; cross_scheme = divergence_scheme),
                            multi_dimensional_stencil = false)
 
     N = required_halo_size(vorticity_scheme)
@@ -144,10 +144,10 @@ const VectorInvariantKEGradientEnergyConserving = VectorInvariant{<:Any, <:Any,
 const VectorInvariantKineticEnergyUpwinding     = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme}
 
 
-#                                                 VectorInvariant{N,     FT,    M,     Z,     ZS,    V,                                     K,     D,     U (upwinding)
-const VectorInvariantCrossVerticalUpwinding     = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:Any, <:Any, <:CrossAndSelfUpwinding}
-const VectorInvariantSelfVerticalUpwinding      = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:Any, <:Any, <:OnlySelfUpwinding}
-const VectorInvariantVelocityVerticalUpwinding  = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:Any, <:Any, <:VelocityUpwinding}
+#                                                 VectorInvariant{N,     FT,    M,     Z,     ZS,     V,     K,     D,                                     U (upwinding)
+const VectorInvariantCrossVerticalUpwinding     = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any,  <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:CrossAndSelfUpwinding}
+const VectorInvariantSelfVerticalUpwinding      = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any,  <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:OnlySelfUpwinding}
+const VectorInvariantVelocityVerticalUpwinding  = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any,  <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:VelocityUpwinding}
 
 Base.summary(a::VectorInvariant)                 = string("Vector Invariant, Dimension-by-dimension reconstruction")
 Base.summary(a::MultiDimensionalVectorInvariant) = string("Vector Invariant, Multidimensional reconstruction")
@@ -166,10 +166,7 @@ Base.show(io::IO, a::VectorInvariant{N, FT}) where {N, FT} =
 ##### Convenience for WENO Vector Invariant
 #####
 
-#                           VectorInvariant{N,     FT,    M,     Z (vorticity scheme), ZS,    V (vertical scheme),      K (kinetic energy gradient scheme)
-const WENOVectorInvariant = VectorInvariant{<:Any, <:Any, <:Any, <:WENO, <:Any, <:WENO, <:WENO}
-
-nothing_to_default(user_value, default) = isnothing(user_value) ? default : user_value
+nothing_to_default(user_value; default) = isnothing(user_value) ? default : user_value
 
 """
     WENOVectorInvariant(; upwinding = nothing,
@@ -188,23 +185,23 @@ function WENOVectorInvariant(; upwinding = nothing,
                                weno_kw...)
 
     if isnothing(order) # apply global defaults
-        vorticity_order               = nothing_to_default(vorticity_order, default=9)
-        vertical_order                = nothing_to_default(vertical_order, default=5)
-        divergence_order              = nothing_to_default(divergence_order, default=5)
-        kinetic_energy_gradient_order = nothing_to_default(kinetic_energy_gradient_order, default=5)
+        vorticity_order               = nothing_to_default(vorticity_order,  default = 9)
+        vertical_order                = nothing_to_default(vertical_order,   default = 5)
+        divergence_order              = nothing_to_default(divergence_order, default = 5)
+        kinetic_energy_gradient_order = nothing_to_default(kinetic_energy_gradient_order, default = 5)
     else # apply user supplied `order` unless overridden by more specific value
-        vorticity_order               = nothing_to_default(vorticity_order, default=order)
-        vertical_order                = nothing_to_default(vertical_order, default=order)
-        divergence_order              = nothing_to_default(divergence_order, default=order)
-        kinetic_energy_gradient_order = nothing_to_default(kinetic_energy_gradient_order, default=order)
+        vorticity_order               = nothing_to_default(vorticity_order,  default = order)
+        vertical_order                = nothing_to_default(vertical_order,   default = order)
+        divergence_order              = nothing_to_default(divergence_order, default = order)
+        kinetic_energy_gradient_order = nothing_to_default(kinetic_energy_gradient_order, default = order)
     end
 
-    vorticity_scheme               = WENO(; order=vorticity_order, weno_kw...)
-    vertical_scheme                = WENO(; order=vertical_order, weno_kw...)
-    kinetic_energy_gradient_scheme = WENO(; order=kinetic_energy_gradient_order, weno_kw...)
-    divergence_scheme              = WENO(; order=divergence_order, weno_kw...)
+    vorticity_scheme               = WENO(; order = vorticity_order, weno_kw...)
+    vertical_scheme                = WENO(; order = vertical_order, weno_kw...)
+    kinetic_energy_gradient_scheme = WENO(; order = kinetic_energy_gradient_order, weno_kw...)
+    divergence_scheme              = WENO(; order = divergence_order, weno_kw...)
 
-    default_upwinding = OnlySelfUpwinding(cross_scheme=divergence_scheme)
+    default_upwinding = OnlySelfUpwinding(cross_scheme = divergence_scheme)
     upwinding = nothing_to_default(upwinding; default = default_upwinding)
 
     schemes = (vorticity_scheme, vertical_scheme, kinetic_energy_gradient_scheme, divergence_scheme)

diff --git a/src/Architectures.jl b/src/Architectures.jl
@@ -115,4 +115,10 @@ end
 @inline unsafe_free!(a::CuArray) = CUDA.unsafe_free!(a)
 @inline unsafe_free!(a)          = nothing
 
+# Convert arguments to GPU-compatible types
+@inline convert_args(::CPU, args) = args # CPU: returned unchanged
+@inline convert_args(::GPU, args) = CUDA.cudaconvert(args) # GPU: converted as a whole
+@inline convert_args(::GPU, args::Tuple) = map(CUDA.cudaconvert, args) # GPU tuple: each element converted
+
 end # module
+
diff --git a/src/DistributedComputations/distributed_architectures.jl b/src/DistributedComputations/distributed_architectures.jl
@@ -2,7 +2,7 @@ using Oceananigans.Architectures
 using Oceananigans.Grids: topology, validate_tupled_argument
 using CUDA: ndevices, device!
 
-import Oceananigans.Architectures: device, cpu_architecture, arch_array, array_type, child_architecture
+import Oceananigans.Architectures: device, cpu_architecture, arch_array, array_type, child_architecture, convert_args
 import Oceananigans.Grids: zeros
 import Oceananigans.Utils: sync_device!, tupleit
 
@@ -210,7 +210,7 @@ function Distributed(child_architecture = CPU();
                      partition = Partition(MPI.Comm_size(communicator)))
 
     if !(MPI.Initialized())
-        @info "MPI has not been initialized, so we are calling MPI.Init()".
+        @info "MPI has not been initialized, so we are calling MPI.Init()."
         MPI.Init()
     end
 
@@ -265,6 +265,7 @@ arch_array(arch::Distributed, A)      = arch_array(child_architecture(arch), A)
 zeros(FT, arch::Distributed, N...)    = zeros(FT, child_architecture(arch), N...)
 array_type(arch::Distributed)         = array_type(child_architecture(arch))
 sync_device!(arch::Distributed)       = sync_device!(arch.child_architecture)
+convert_args(arch::Distributed, arg)  = convert_args(child_architecture(arch), arg)
 
 cpu_architecture(arch::DistributedCPU) = arch
 cpu_architecture(arch::Distributed{A, S}) where {A, S} = 

diff --git a/src/DistributedComputations/distributed_fields.jl b/src/DistributedComputations/distributed_fields.jl
@@ -39,16 +39,19 @@ end
 function set!(u::DistributedField, v::Union{Array, CuArray})
     gsize = global_size(architecture(u), size(u))
 
-    if size(v) == size(u)
-        f = arch_array(architecture(u), v)
-        u .= f
-        return u
-    elseif size(v) == gsize
+    if size(v) == gsize # `v` matches the global size: it goes through `partition_global_array`
         f = partition_global_array(architecture(u), v, size(u))
         u .= f
         return u
     else
-        throw(ArgumentError("ERROR: DimensionMismatch: array could not be set to match destination field"))
+        try # any other size: attempt a direct broadcast of `v` into `u`
+            f = arch_array(architecture(u), v)
+            u .= f
+            return u
+
+        catch # NOTE(review): bare `catch` traps every exception, not only size mismatches
+            throw(ArgumentError("ERROR: DimensionMismatch: array could not be set to match destination field")) # NOTE(review): an `ArgumentError` is thrown although the message says DimensionMismatch
+        end
     end
 end
 

diff --git a/src/DistributedComputations/distributed_grids.jl b/src/DistributedComputations/distributed_grids.jl
@@ -41,8 +41,9 @@ end
 @inline local_sizes(N, R::Fractional) = Tuple(ceil(Int, N * r) for r in R.sizes)
 @inline function local_sizes(N, R::Sizes)
     if N != sum(R.sizes)
-        @warn "The domain size specified in the architecture $(R.sizes) is inconsistent 
-               with the grid size $N: using the architecture-specified size"
+        @warn "The Sizes specified in the architecture $(R.sizes) is inconsistent  
+               with the grid size: (N = $N != sum(Sizes) = $(sum(R.sizes))). 
+               Using $(R.sizes)..."
     end
     return R.sizes
 end
@@ -80,9 +81,9 @@ function RectilinearGrid(arch::Distributed,
     TY = insert_connected_topology(topology[2], Ry, rj)
     TZ = insert_connected_topology(topology[3], Rz, rk)
 
-    xl = partition(x, nx, arch, 1)
-    yl = partition(y, ny, arch, 2)
-    zl = partition(z, nz, arch, 3)
+    xl = Rx == 1 ? x : partition_coordinate(x, nx, arch, 1)
+    yl = Ry == 1 ? y : partition_coordinate(y, ny, arch, 2)
+    zl = Rz == 1 ? z : partition_coordinate(z, nz, arch, 3)
 
     Lx, xᶠᵃᵃ, xᶜᵃᵃ, Δxᶠᵃᵃ, Δxᶜᵃᵃ = generate_coordinate(FT, topology[1](), nx, Hx, xl, :x, child_architecture(arch))
     Ly, yᵃᶠᵃ, yᵃᶜᵃ, Δyᵃᶠᵃ, Δyᵃᶜᵃ = generate_coordinate(FT, topology[2](), ny, Hy, yl, :y, child_architecture(arch))
@@ -127,9 +128,9 @@ function LatitudeLongitudeGrid(arch::Distributed,
     TY = insert_connected_topology(topology[2], Ry, rj)
     TZ = insert_connected_topology(topology[3], Rz, rk)
 
-    λl = partition(longitude, nλ, arch, 1)
-    φl = partition(latitude,  nφ, arch, 2)
-    zl = partition(z,         nz, arch, 3)
+    λl = Rx == 1 ? longitude : partition_coordinate(longitude, nλ, arch, 1)
+    φl = Ry == 1 ? latitude  : partition_coordinate(latitude,  nφ, arch, 2)
+    zl = Rz == 1 ? z         : partition_coordinate(z,         nz, arch, 3)
 
     # Calculate all directions (which might be stretched)
     # A direction is regular if the domain passed is a Tuple{<:Real, <:Real}, 
@@ -186,9 +187,9 @@ function reconstruct_global_grid(grid::DistributedRectilinearGrid)
     z = cpu_face_constructor_z(grid)
 
     ## This will not work with 3D parallelizations!!
-    xG = Rx == 1 ? x : assemble(x, nx, Rx, ri, rj, rk, arch.communicator)
-    yG = Ry == 1 ? y : assemble(y, ny, Ry, rj, ri, rk, arch.communicator)
-    zG = Rz == 1 ? z : assemble(z, nz, Rz, rk, ri, rj, arch.communicator)
+    xG = Rx == 1 ? x : assemble_coordinate(x, nx, Rx, ri, rj, rk, arch.communicator)
+    yG = Ry == 1 ? y : assemble_coordinate(y, ny, Ry, rj, ri, rk, arch.communicator)
+    zG = Rz == 1 ? z : assemble_coordinate(z, nz, Rz, rk, ri, rj, arch.communicator)
 
     child_arch = child_architecture(arch)
 
@@ -229,9 +230,9 @@ function reconstruct_global_grid(grid::DistributedLatitudeLongitudeGrid)
     z = cpu_face_constructor_z(grid)
 
     ## This will not work with 3D parallelizations!!
-    λG = Rx == 1 ? λ : assemble(λ, nλ, Rx, ri, rj, rk, arch.communicator)
-    φG = Ry == 1 ? φ : assemble(φ, nφ, Ry, rj, ri, rk, arch.communicator)
-    zG = Rz == 1 ? z : assemble(z, nz, Rz, rk, ri, rj, arch.communicator)
+    λG = Rx == 1 ? λ : assemble_coordinate(λ, nλ, Rx, ri, rj, rk, arch.communicator)
+    φG = Ry == 1 ? φ : assemble_coordinate(φ, nφ, Ry, rj, ri, rk, arch.communicator)
+    zG = Rz == 1 ? z : assemble_coordinate(z, nz, Rz, rk, ri, rj, arch.communicator)
 
     child_arch = child_architecture(arch)
 
@@ -340,4 +341,4 @@ function reconstruct_global_topology(T, R, r, r1, r2, comm)
     else
         return Bounded
     end
-end
+end
diff --git a/src/DistributedComputations/halo_communication.jl b/src/DistributedComputations/halo_communication.jl
@@ -142,11 +142,10 @@ end
 # corner passing routine
 function fill_corners!(c, connectivity, indices, loc, arch, grid, buffers, args...; async = false, only_local_halos = false, kwargs...)
 
-    if only_local_halos # No corner filling needed!
-        return nothing
-    end
+    # No corner filling needed!
+    only_local_halos && return nothing
 
-    # This has to be synchronized!!
+    # This has to be synchronized!
     fill_send_buffers!(c, buffers, grid, Val(:corners))
     sync_device!(arch)
 
@@ -189,7 +188,6 @@ function fill_halo_event!(c, fill_halos!, bcs, indices, loc, arch, grid::Distrib
 
     if !only_local_halos # Then we need to fill the `send` buffers
         fill_send_buffers!(c, buffers, grid, Val(buffer_side))
-        sync_device!(arch)
     end
 
     # Calculate size and offset of the fill_halo kernel
@@ -201,7 +199,7 @@ function fill_halo_event!(c, fill_halos!, bcs, indices, loc, arch, grid::Distrib
     requests = fill_halos!(c, bcs..., size, offset, loc, arch, grid, buffers, args...; only_local_halos, kwargs...)
 
     pool_requests_or_complete_comm!(c, arch, grid, buffers, requests, async, buffer_side)
-
+    
     return nothing
 end
 
@@ -245,6 +243,8 @@ for (side, opposite_side) in zip([:west, :south], [:east, :north])
                                   grid::DistributedGrid, buffers, args...; only_local_halos = false, kwargs...)
 
             only_local_halos && return nothing
+
+            sync_device!(arch)
 
             @assert bc_side.condition.from == bc_opposite_side.condition.from  # Extra protection in case of bugs
             local_rank = bc_side.condition.from
@@ -275,11 +275,12 @@ for side in [:west, :east, :south, :north]
 
             only_local_halos && return nothing
 
+            sync_device!(arch)
+
             child_arch = child_architecture(arch)
             local_rank = bc_side.condition.from
 
             recv_req = $recv_and_fill_side_halo!(c, grid, arch, loc, local_rank, bc_side.condition.to, buffers)
-
             send_req = $send_side_halo(c, grid, arch, loc, local_rank, bc_side.condition.to, buffers)
 
             return [send_req, recv_req]

diff --git a/src/DistributedComputations/partition_assemble.jl b/src/DistributedComputations/partition_assemble.jl
@@ -37,14 +37,19 @@ end
 
 # Partitioning (localization of global objects) and assembly (global assembly of local objects)
 # Used for grid constructors (cpu_face_constructor_x, cpu_face_constructor_y, cpu_face_constructor_z)
-# which means that we need to repeat the value at the right boundary
-function partition(c::AbstractVector, n, arch, idx)
+function partition_coordinate(c::AbstractVector, n, arch, idx)
     nl = concatenate_local_sizes(n, arch, idx)
     r  = arch.local_index[idx]
-    return c[1 + sum(nl[1:r-1]) : sum(nl[1:r])]
+    # Allow for Face values: the last rank keeps everything up to the end of `c`
+    if r == arch.ranks[idx]
+        return c[1 + sum(nl[1:r-1]) : end]
+    else
+        return c[1 + sum(nl[1:r-1]) : sum(nl[1:r])] # other ranks: the `nl[r]` entries following those of lower ranks
+    end
 end
 
-function partition(c::Tuple, n, arch, idx)
+function partition_coordinate(c::Tuple, n, arch, idx)
     nl = concatenate_local_sizes(n, arch, idx)
     N  = sum(nl)
     R  = arch.ranks[idx]
@@ -60,14 +65,14 @@ function partition(c::Tuple, n, arch, idx)
 end
 
 """
-    assemble(c::AbstractVector, n, R, r, r1, r2, comm) 
+    assemble_coordinate(c::AbstractVector, n, R, r, r1, r2, comm) 
 
 Builds a linear global coordinate vector given a local coordinate vector `c_local`,
 a local number of elements `n`, number of ranks `R`, rank `r`,
 and communicator `comm`. Since we use a global reduction, only ranks at positions
 1 in the other two directions (`r1 == 1` and `r2 == 1`) fill the 1D array.
 """
-function assemble(c_local::AbstractVector, n, R, r, r1, r2, comm) 
+function assemble_coordinate(c_local::AbstractVector, n, R, r, r1, r2, comm) 
     nl = concatenate_local_sizes(n, R, r)
 
     c_global = zeros(eltype(c_local), sum(nl)+1)
@@ -83,7 +88,7 @@ function assemble(c_local::AbstractVector, n, R, r, r1, r2, comm)
 end
 
 # Simple case, just take the first and the last core
-function assemble(c::Tuple, n, R, r, r1, r2, comm) 
+function assemble_coordinate(c::Tuple, n, R, r, r1, r2, comm) 
     c_global = zeros(Float64, 2)
 
     if r == 1 && r1 == 1 && r2 == 1

diff --git a/src/Grids/grid_generation.jl b/src/Grids/grid_generation.jl
@@ -13,6 +13,7 @@ get_face_node(coord::Function, i) = coord(i)
 get_face_node(coord::AbstractVector, i) = CUDA.@allowscalar coord[i]
 
 const AT = AbstractTopology
+
 lower_exterior_Δcoordᶠ(::AT,              Fi, Hcoord) = [Fi[end - Hcoord + i] - Fi[end - Hcoord + i - 1] for i = 1:Hcoord]
 lower_exterior_Δcoordᶠ(::BoundedTopology, Fi, Hcoord) = [Fi[2]  - Fi[1] for _ = 1:Hcoord]