-
Notifications
You must be signed in to change notification settings - Fork 86
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
filter out nodes with high queue capacity (#4051)
This change is related to #4049 where, instead of queueing locally in each compute node, we try to queue in the requester so that jobs are scheduled to new nodes that join, or to the first node that frees up its resources. The current state is that we don't filter out nodes if they don't have immediate available capacity or if their queue is growing large. We rank nodes with more capacity higher, but we don't filter out nodes with no capacity. This change lets operators set `NodeOverSubscriptionFactor` in the requester node so that it filters out any compute node whose total active and queued capacity is beyond the factor. The default is `1.5`, which means the compute node can queue locally half of its total capacity in addition to the running capacity. ## Testing This change has been tested with #4049 in dev stack as documented in that issue.
- Loading branch information
Showing
20 changed files
with
700 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
package ranking | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
|
||
"github.com/rs/zerolog/log" | ||
|
||
"github.com/bacalhau-project/bacalhau/pkg/lib/validate" | ||
"github.com/bacalhau-project/bacalhau/pkg/models" | ||
"github.com/bacalhau-project/bacalhau/pkg/orchestrator" | ||
) | ||
|
||
// OverSubscriptionNodeRanker is a node ranker that rejects compute nodes whose
// running and queued load, together with the incoming job, would exceed an
// over-subscribed multiple of their maximum capacity.
type OverSubscriptionNodeRanker struct {
	// factor is the multiplier applied to a node's MaxCapacity to obtain the
	// largest total usage the node is allowed to carry.
	factor float64
}
|
||
func NewOverSubscriptionNodeRanker(factor float64) (*OverSubscriptionNodeRanker, error) { | ||
err := validate.IsGreaterOrEqual(factor, 1, | ||
"over subscription factor %f must be greater or equal to 1", factor) | ||
if err != nil { | ||
return nil, err | ||
} | ||
return &OverSubscriptionNodeRanker{factor: factor}, nil | ||
} | ||
|
||
// RankNodes ranks nodes based on the ratio of queued capacity to total capacity. | ||
// - Rank -1: If the ratio of is greater than the factor, the node is considered over-subscribed. | ||
// - Rank 0: If the node is not over-subscribed. | ||
func (s *OverSubscriptionNodeRanker) RankNodes( | ||
ctx context.Context, job models.Job, nodes []models.NodeInfo) ([]orchestrator.NodeRank, error) { | ||
jobResourceUsage, err := job.Task().ResourcesConfig.ToResources() | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to convert job resources config to resources: %w", err) | ||
} | ||
|
||
ranks := make([]orchestrator.NodeRank, len(nodes)) | ||
for i, node := range nodes { | ||
var rank int | ||
var reason string | ||
|
||
if node.ComputeNodeInfo == nil || node.ComputeNodeInfo.MaxCapacity.IsZero() { | ||
rank = orchestrator.RankUnsuitable | ||
reason = "node queue usage is unknown" | ||
} else { | ||
// overSubscriptionCapacity is the capacity at which the node can accept more jobs | ||
overSubscriptionCapacity := node.ComputeNodeInfo.MaxCapacity.Multiply(s.factor) | ||
|
||
// totalUsage is the sub of actively running capacity, queued capacity and new job resources | ||
totalUsage := node.ComputeNodeInfo.MaxCapacity. | ||
Sub(node.ComputeNodeInfo.AvailableCapacity). | ||
Add(node.ComputeNodeInfo.QueueUsedCapacity). | ||
Add(*jobResourceUsage) | ||
|
||
if totalUsage.LessThanEq(*overSubscriptionCapacity) { | ||
rank = orchestrator.RankPossible | ||
reason = "node is not over-subscribed" | ||
} else { | ||
rank = orchestrator.RankUnsuitable | ||
reason = "node busy with available capacity " + node.ComputeNodeInfo.AvailableCapacity.String() | ||
if !node.ComputeNodeInfo.QueueUsedCapacity.IsZero() { | ||
reason += " and queue capacity " + node.ComputeNodeInfo.QueueUsedCapacity.String() | ||
} | ||
} | ||
} | ||
|
||
ranks[i] = orchestrator.NodeRank{ | ||
NodeInfo: node, | ||
Rank: rank, | ||
Reason: reason, | ||
Retryable: true, | ||
} | ||
log.Ctx(ctx).Trace().Object("Rank", ranks[i]).Msg("Ranked node") | ||
} | ||
return ranks, nil | ||
} |
Oops, something went wrong.