Commit

race safe (#19)
* now race safe

* enable race in GitHub pipeline

* Microseconds isn't enough on GitHub pipeline

* invoke cancelFunc to avoid context leak
haitch committed Sep 6, 2022
1 parent c59d69d commit c35056b
Showing 6 changed files with 102 additions and 79 deletions.
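The last commit message, "invoke cancelFunc to avoid context leak", refers to a property of context.WithCancel: until the returned cancel function is invoked, the parent context keeps a reference to the derived child, so a task that finishes without canceling its context accumulates garbage. A minimal sketch of the pattern (illustrative only, not this repository's code):

package main

import (
	"context"
	"time"
)

func main() {
	parent := context.Background()

	// Leaky: the cancel function is discarded, so the derived context
	// stays registered with its parent; `go vet` flags this pattern
	// as a possible context leak.
	leaky, _ := context.WithCancel(parent)
	_ = leaky

	// Fixed: always release the derived context, even on success;
	// this is what the commit now does inside Task.finish.
	ctx, cancel := context.WithCancel(parent)
	defer cancel()

	select {
	case <-time.After(10 * time.Millisecond): // work finished
	case <-ctx.Done(): // canceled
	}
}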
2 changes: 1 addition & 1 deletion .github/workflows/go.yml
@@ -26,7 +26,7 @@ jobs:
       run: go build -v .
 
     - name: Test
-      run: go test -coverprofile=coverage.txt -covermode=atomic ./...
+      run: go test -race -coverprofile=coverage.txt -covermode=atomic ./...
 
     - name: Codecov
       uses: codecov/[email protected]
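For reference, the -race flag builds the test binary with Go's race detector, which reports unsynchronized concurrent memory access at runtime. A minimal illustration (not from this repository) of the kind of race the flag catches, next to the mutex-guarded form this commit applies to Task:

package main

import "sync"

type counter struct {
	mu sync.Mutex
	n  int
}

// racy: both goroutines write n without synchronization;
// `go run -race .` reports a data race here.
func racy(c *counter) {
	var wg sync.WaitGroup
	for i := 0; i < 2; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			c.n++
		}()
	}
	wg.Wait()
}

// safe: the same increment guarded by a mutex, mirroring how the
// commit guards Task.state behind t.mutex.
func safe(c *counter) {
	var wg sync.WaitGroup
	for i := 0; i < 2; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			c.mu.Lock()
			c.n++
			c.mu.Unlock()
		}()
	}
	wg.Wait()
}

func main() {
	racy(&counter{})
	safe(&counter{})
}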
4 changes: 2 additions & 2 deletions error_test.go
@@ -29,8 +29,8 @@ func TestTimeoutCase(t *testing.T) {
 	ctx, cancelFunc := newTestContextWithTimeout(t, 3*time.Second)
 	defer cancelFunc()
 
-	tsk := asynctask.Start(ctx, getCountingTask(10, 200*time.Millisecond))
-	_, err := tsk.WaitWithTimeout(ctx, 300*time.Millisecond)
+	tsk := asynctask.Start(ctx, getCountingTask(10, "timeoutCase", countingTaskDefaultStepLatency))
+	_, err := tsk.WaitWithTimeout(ctx, 30*time.Millisecond)
 	assert.True(t, errors.Is(err, context.DeadlineExceeded), "expecting DeadlineExceeded")
 
 	// the last Wait error should affect running task
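For context on the numbers above: WaitWithTimeout only stops the waiting; the task itself keeps running. With the counting task's step latency now 20ms (see task_test.go below), a 30ms wait expires well before the 10-step count completes, which is exactly what the test asserts. A hedged usage sketch (the import path is assumed, not stated in this diff):

package main

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/Azure/go-asynctask" // import path assumed
)

func main() {
	ctx := context.Background()

	// A task that takes ~200ms in total (10 steps of 20ms each).
	tsk := asynctask.Start(ctx, func(ctx context.Context) (*int, error) {
		n := 0
		for i := 0; i < 10; i++ {
			select {
			case <-time.After(20 * time.Millisecond):
				n = i
			case <-ctx.Done():
				return &n, ctx.Err()
			}
		}
		return &n, nil
	})

	// The 30ms wait gives up long before the task finishes; the task
	// keeps running in the background.
	_, err := tsk.WaitWithTimeout(ctx, 30*time.Millisecond)
	fmt.Println(errors.Is(err, context.DeadlineExceeded)) // true
}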
40 changes: 27 additions & 13 deletions task.go
@@ -19,30 +19,33 @@ type Task[T any] struct {
 	err        error
 	cancelFunc context.CancelFunc
 	waitGroup  *sync.WaitGroup
+	mutex      *sync.Mutex
 }
 
 // State returns the state of the task.
 func (t *Task[T]) State() State {
+	t.mutex.Lock()
+	defer t.mutex.Unlock()
 	return t.state
 }
 
-// Cancel abort the task execution
-// !! only if the function provided handles context cancel.
-func (t *Task[T]) Cancel() {
-	if !t.state.IsTerminalState() {
-		t.cancelFunc()
-
-		var result T
-		t.finish(StateCanceled, &result, ErrCanceled)
+// Cancel the task by canceling the context.
+// !! this relies on the task function to check context cancellation and handle it properly.
+func (t *Task[T]) Cancel() bool {
+	if !t.finished() {
+		t.finish(StateCanceled, nil, ErrCanceled)
+		return true
 	}
+
+	return false
 }
 
 // Wait blocks the current thread/routine until the task finishes or fails.
 // The context passed in can terminate the wait through context cancellation,
 // but won't terminate the task (unless it's the same context).
 func (t *Task[T]) Wait(ctx context.Context) error {
 	// return immediately if task already in terminal state.
-	if t.state.IsTerminalState() {
+	if t.finished() {
 		return t.err
 	}
 
@@ -64,7 +67,7 @@ func (t *Task[T]) Wait(ctx context.Context) error {
 // timeout only stops waiting; the task will remain running.
 func (t *Task[T]) WaitWithTimeout(ctx context.Context, timeout time.Duration) (*T, error) {
 	// return immediately if task already in terminal state.
-	if t.state.IsTerminalState() {
+	if t.finished() {
 		return t.result, t.err
 	}
 
@@ -90,13 +93,14 @@ func Start[T any](ctx context.Context, task AsyncFunc[T]) *Task[T] {
 	ctx, cancel := context.WithCancel(ctx)
 	wg := &sync.WaitGroup{}
 	wg.Add(1)
+	mutex := &sync.Mutex{}
 
-	var result T
 	record := &Task[T]{
 		state:      StateRunning,
-		result:     &result,
+		result:     nil,
 		cancelFunc: cancel,
 		waitGroup:  wg,
+		mutex:      mutex,
 	}
 
 	go runAndTrackGenericTask(ctx, record, task)
@@ -113,6 +117,7 @@ func NewCompletedTask[T any](value *T) *Task[T] {
 		// nil cancelFunc and waitGroup should be protected with IsTerminalState()
 		cancelFunc: nil,
 		waitGroup:  nil,
+		mutex:      &sync.Mutex{},
 	}
 }
 
@@ -138,9 +143,18 @@ func runAndTrackGenericTask[T any](ctx context.Context, record *Task[T], task fu
 
 func (t *Task[T]) finish(state State, result *T, err error) {
 	// only update state and result if not yet canceled
-	if !t.state.IsTerminalState() {
+	if !t.finished() {
+		t.mutex.Lock()
+		t.cancelFunc() // cancel the context
 		t.state = state
 		t.result = result
 		t.err = err
+		t.mutex.Unlock()
 	}
 }
+
+func (t *Task[T]) finished() bool {
+	t.mutex.Lock()
+	defer t.mutex.Unlock()
+	return t.state.IsTerminalState()
+}
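With this change Cancel reports whether it actually transitioned the task, so callers can distinguish a first cancellation from a no-op on an already-finished task. A hedged usage sketch (the caller code is hypothetical and the import path is assumed):

package main

import (
	"context"
	"fmt"
	"time"

	"github.com/Azure/go-asynctask" // import path assumed
)

func main() {
	ctx := context.Background()

	tsk := asynctask.Start(ctx, func(ctx context.Context) (*int, error) {
		select {
		case <-time.After(time.Second):
			n := 42
			return &n, nil
		case <-ctx.Done():
			return nil, ctx.Err()
		}
	})

	// First call transitions Running -> Canceled and returns true;
	// the second call sees a terminal state and returns false.
	fmt.Println(tsk.Cancel()) // true
	fmt.Println(tsk.Cancel()) // false
}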
37 changes: 22 additions & 15 deletions task_test.go
@@ -2,6 +2,7 @@ package asynctask_test
 
 import (
 	"context"
+	"fmt"
 	"testing"
 	"time"
 
@@ -10,6 +11,7 @@ import (
 )
 
 const testContextKey string = "testing"
+const countingTaskDefaultStepLatency time.Duration = 20 * time.Millisecond
 
 func newTestContext(t *testing.T) context.Context {
 	return context.WithValue(context.TODO(), testContextKey, t)
@@ -19,18 +21,20 @@ func newTestContextWithTimeout(t *testing.T, timeout time.Duration) (context.Con
 	return context.WithTimeout(context.WithValue(context.TODO(), testContextKey, t), timeout)
 }
 
-func getCountingTask(countTo int, sleepInterval time.Duration) asynctask.AsyncFunc[int] {
+func getCountingTask(countTo int, taskId string, sleepInterval time.Duration) asynctask.AsyncFunc[int] {
 	return func(ctx context.Context) (*int, error) {
 		t := ctx.Value(testContextKey).(*testing.T)
 
 		result := 0
 		for i := 0; i < countTo; i++ {
 			select {
 			case <-time.After(sleepInterval):
-				t.Logf(" working %d", i)
+				t.Logf("[%s]: counting %d", taskId, i)
 				result = i
 			case <-ctx.Done():
-				t.Log("work canceled")
+				// testing.Logf causes a data race when the test has already finished: https://github.com/golang/go/issues/40343
+				// leave a minor time buffer before exiting the test so this last log can finish.
+				t.Logf("[%s]: work canceled", taskId)
 				return &result, nil
 			}
 		}
@@ -43,7 +47,7 @@ func TestEasyGenericCase(t *testing.T) {
 	ctx, cancelFunc := newTestContextWithTimeout(t, 3*time.Second)
 	defer cancelFunc()
 
-	t1 := asynctask.Start(ctx, getCountingTask(10, 200*time.Millisecond))
+	t1 := asynctask.Start(ctx, getCountingTask(10, "easyTask", countingTaskDefaultStepLatency))
 	assert.Equal(t, asynctask.StateRunning, t1.State(), "Task should be queued to Running")
 
 	rawResult, err := t1.Result(ctx)
@@ -62,26 +66,27 @@ func TestEasyGenericCase(t *testing.T) {
 	assert.NotNil(t, rawResult)
 	assert.Equal(t, *rawResult, 9)
 
-	assert.True(t, elapsed.Microseconds() < 3, "Second wait should return immediately")
+	// Result should be returned immediately
+	assert.True(t, elapsed.Milliseconds() < 1, fmt.Sprintf("Second wait should have returned immediately: %s", elapsed))
 }
 
 func TestCancelFuncOnGeneric(t *testing.T) {
 	t.Parallel()
 	ctx, cancelFunc := newTestContextWithTimeout(t, 3*time.Second)
 	defer cancelFunc()
 
-	t1 := asynctask.Start(ctx, getCountingTask(10, 200*time.Millisecond))
+	t1 := asynctask.Start(ctx, getCountingTask(10, "cancelTask", countingTaskDefaultStepLatency))
 	assert.Equal(t, asynctask.StateRunning, t1.State(), "Task should be queued to Running")
 
-	time.Sleep(time.Second * 1)
+	time.Sleep(countingTaskDefaultStepLatency)
 	t1.Cancel()
 
 	_, err := t1.Result(ctx)
 	assert.Equal(t, asynctask.ErrCanceled, err, "should return reason of error")
 	assert.Equal(t, asynctask.StateCanceled, t1.State(), "Task should remain in canceled state")
 
 	// I can cancel again, and nothing changes
-	time.Sleep(time.Second * 1)
+	time.Sleep(countingTaskDefaultStepLatency)
 	t1.Cancel()
 	_, err = t1.Result(ctx)
 	assert.Equal(t, asynctask.ErrCanceled, err, "should return reason of error")
Expand All @@ -101,11 +106,11 @@ func TestConsistentResultAfterCancelGenericTask(t *testing.T) {
ctx, cancelFunc := newTestContextWithTimeout(t, 3*time.Second)
defer cancelFunc()

t1 := asynctask.Start(ctx, getCountingTask(10, 200*time.Millisecond))
t2 := asynctask.Start(ctx, getCountingTask(10, 200*time.Millisecond))
t1 := asynctask.Start(ctx, getCountingTask(10, "consistentTask1", countingTaskDefaultStepLatency))
t2 := asynctask.Start(ctx, getCountingTask(10, "consistentTask2", countingTaskDefaultStepLatency))
assert.Equal(t, asynctask.StateRunning, t1.State(), "Task should queued to Running")

time.Sleep(time.Second * 1)
time.Sleep(countingTaskDefaultStepLatency)
start := time.Now()
t1.Cancel()
duration := time.Since(start)
Expand Down Expand Up @@ -150,22 +155,24 @@ func TestCrazyCaseGeneric(t *testing.T) {
ctx, cancelFunc := newTestContextWithTimeout(t, 3*time.Second)
defer cancelFunc()

numOfTasks := 8000 // if you have --race switch on: limit on 8128 simultaneously alive goroutines is exceeded, dying
numOfTasks := 100 // if you have --race switch on: limit on 8128 simultaneously alive goroutines is exceeded, dying
tasks := map[int]*asynctask.Task[int]{}
for i := 0; i < numOfTasks; i++ {
tasks[i] = asynctask.Start(ctx, getCountingTask(10, 200*time.Millisecond))
tasks[i] = asynctask.Start(ctx, getCountingTask(10, fmt.Sprintf("CrazyTask%d", i), countingTaskDefaultStepLatency))
}

time.Sleep(200 * time.Millisecond)
// sleep 1 step, then cancel task with even number
time.Sleep(countingTaskDefaultStepLatency)
for i := 0; i < numOfTasks; i += 2 {
tasks[i].Cancel()
}

time.Sleep(time.Duration(numOfTasks) * 2 * time.Microsecond)
for i := 0; i < numOfTasks; i += 1 {
rawResult, err := tasks[i].Result(ctx)

if i%2 == 0 {
assert.Equal(t, asynctask.ErrCanceled, err, "should be canceled")
assert.Equal(t, asynctask.ErrCanceled, err, fmt.Sprintf("task %s should be canceled, but it finished with %+v", fmt.Sprintf("CrazyTask%d", i), rawResult))
assert.Equal(t, *rawResult, 0)
} else {
assert.NoError(t, err)
Expand Down
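The comment added to getCountingTask above points at golang/go#40343: calling t.Logf from a goroutine after the test function has returned races with the framework's test teardown. A minimal reproduction sketch (illustrative only, not part of this commit); under go test -race the detector may flag the t.Log call:

package example_test

import (
	"testing"
	"time"
)

func TestLeakyLogger(t *testing.T) {
	go func() {
		// This goroutine outlives the test, so the write inside t.Log
		// races with the framework marking the test complete.
		time.Sleep(10 * time.Millisecond)
		t.Log("logging after the test has finished")
	}()
	// The test returns immediately, while the goroutine is still pending.
}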
36 changes: 8 additions & 28 deletions wait_all.go
@@ -3,7 +3,6 @@ package asynctask
 
 import (
 	"context"
 	"fmt"
-	"sync"
 )
 
 type Waitable interface {
@@ -21,14 +20,14 @@ type WaitAllOptions struct {
 func WaitAll(ctx context.Context, options *WaitAllOptions, tasks ...Waitable) error {
 	tasksCount := len(tasks)
 
-	mutex := sync.Mutex{}
-	errorChClosed := false
 	errorCh := make(chan error, tasksCount)
-	// hard close channel
-	defer close(errorCh)
+	// when failFast is enabled, we return on the first error we see, while other tasks may still post errors to this channel.
+	if !options.FailFast {
+		defer close(errorCh)
+	}
 
 	for _, tsk := range tasks {
-		go waitOne(ctx, tsk, errorCh, &errorChClosed, &mutex)
+		go waitOne(ctx, tsk, errorCh)
 	}
 
 	runningTasks := tasksCount
@@ -40,20 +39,17 @@ func WaitAll(ctx context.Context, options *WaitAllOptions, tasks ...Waitable) er
 			if err != nil {
 				// return immediately after receiving the first error.
 				if options.FailFast {
-					softCloseChannel(&mutex, &errorChClosed)
 					return err
 				}
 
 				errList = append(errList, err)
 			}
 		case <-ctx.Done():
-			softCloseChannel(&mutex, &errorChClosed)
-			return fmt.Errorf("WaitAll context canceled: %w", ctx.Err())
+			return fmt.Errorf("WaitAll %w", ctx.Err())
 		}
 
 		// are we finished yet?
 		if runningTasks == 0 {
-			softCloseChannel(&mutex, &errorChClosed)
 			break
 		}
 	}
@@ -69,23 +65,7 @@ func WaitAll(ctx context.Context, options *WaitAllOptions, tasks ...Waitable) er
 	return nil
 }
 
-func waitOne(ctx context.Context, tsk Waitable, errorCh chan<- error, errorChClosed *bool, mutex *sync.Mutex) {
+func waitOne(ctx context.Context, tsk Waitable, errorCh chan<- error) {
 	err := tsk.Wait(ctx)
-
-	// why mutex?
-	// if all tasks start using same context (unittest is good example)
-	// and that context got canceled, all task fail at same time.
-	// first one went in and close the channel, while another one already went through gate check.
-	// raise a panic with send to closed channel.
-	mutex.Lock()
-	defer mutex.Unlock()
-	if !*errorChClosed {
-		errorCh <- err
-	}
-}
-
-func softCloseChannel(mutex *sync.Mutex, closed *bool) {
-	mutex.Lock()
-	defer mutex.Unlock()
-	*closed = true
+	errorCh <- err
 }
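The deleted mutex existed only to avoid a send on a closed channel. The new design sidesteps the problem: errorCh is buffered with capacity len(tasks) and each waitOne goroutine sends exactly once, so no send can ever block, even after WaitAll has returned. In the FailFast path the channel is deliberately left open; an unclosed channel is garbage-collected once unreachable. A minimal sketch of the idea (hypothetical names, not the library's code):

package main

import "fmt"

func main() {
	const n = 4

	// Buffered to n: each worker delivers exactly one error without
	// blocking, so the receiver may stop early without stranding senders.
	results := make(chan error, n)

	for i := 0; i < n; i++ {
		go func(id int) {
			results <- fmt.Errorf("worker %d failed", id)
		}(i)
	}

	// Fail-fast receiver: return after the first error; the remaining
	// sends land in the buffer, and the channel is reclaimed by the GC
	// once nothing references it (channels need not be closed to be freed).
	fmt.Println(<-results)
}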