Async Codegen

1. State Machine 2. structAsyncTaskMethodBuilder publicstructSystem.Runtime.CompilerServices.AsyncTaskMethodBuilder<T> { internalIAsyncStateMachinem_sm; publicvoid Start<TSM>(ref TSM sm) where TSM : IAsyncStateMachine{ //(2) Thread t = Thread.CurrentThread; ExecutionContextSwitcherecs = default(ExecutionContextSwitcher); RuntimeHelpers.PrepareConstrainedRegions(); try{ ExecutionContext.EstablishCopyOnWriteScope(t, false, refecs); sm.MoveNext(); } finally{ ecs.Undo(currentThread); } } } internalstructVoidTaskResult { } structFooAsync_StateMachine : IAsyncStateMachine{ //(1,2,3) privateint _state; publicAsyncTaskMethodBuilder _builder; privatevoidMoveNext() { try{ switch (_state) { TRANSFORMED_BODY } } catch (Exception ex) { _builder.SetException(ex); return; } _builder.SetResult(); } privatevoidSetStateMachine(IAsyncStateMachinesm) { _builder.SetStateMachine(sm); } } asyncTaskFooAsync() { BODY } TaskFooAsync() { varsm = newFooAsync_StateMachine(); sm._state = -1; ... copy params & this if needed into S.M. sm._builder = AsyncTaskMethodBuilder.Create(); sm._builder.Start(refsm); returnsm._builder.Task; } 4. AwaitOnCompleted() 5. structTaskAwaiter 3. AwaiterPattern class AsyncMethodBuilder<T>: publicvoidAwaitOnCompleted<TA, TSM>(ref TA a, ref TSM sm) where TA : INotifyCompletion where TSM : IAsyncStateMachine { if (m_sm == null) { var ignored = this.Task; // allocate a reference m_sm = (IAsyncStateMachine)sm; // box on first await m_sm.SetStateMachine(m_sm); // tie up lose ends } Actioncont; varctx = ExecutionContext.FastCapture(); if (ctx == ExecutionContext.PreAllocatedDefault) { cont = m_defaultContextAction; if (cont == null) cont = newAction(newMoveNextRunner(ctx, this.m_sm).Run); } else{ cont = newMoveNextRunner(ctx, m_sm).Run; } a.OnCompleted(cont); } publicvoidAwaitUnsafeOnCompleted<TA, TSM>(ref TA a, ref TSM sm) //(11) where TA : ICriticalNotifyCompletion where TSM : IAsyncStateMachine { ... 
exactly as above a.UnsafeOnCompleted(cont); } classTask<T>{ TaskAwaiter<T>GetAwaiter() { returnnewTaskAwaiter<T>(this); } } switch(_state) { case0: goto AFTERAWAIT0; case -1: // fallthrough } Console.WriteLine("a"); TaskAwaitertmp= Task.Delay(100).GetAwaiter(); //(4) if(!tmp.IsCompleted) { //(5) _state = 0; _awaiter = tmp; _builder.AwaitOnCompleted(reftmp, refthis); //(8) return; AFTERAWAIT0: //(9) tmp = (TaskAwaiter)_awaiter; _awaiter = default(TaskAwaiter); } tmp.GetResult(); //(6) tmp= default(TaskAwaiter); //(7) Console.WriteLine("b"); Console.WriteLine("a"); awaitTask.Delay(100); Console.WriteLine("b"); Async Codegen structTaskAwaiter<T> : INotifyCompletion{ privatevarm_delegates = newConcurrentQueue<Action>(); publicboolIsCompleted { get; } public T GetResults() { return...; } publicvoidOnCompleted(Actioncont) { //(12) varsc = SynchronizationContext.Current; m_delegates.Enqueue(delegate {sc.Post(_ => cont(), null); }); } } structTaskAwaiter<T> : ICriticalNotifyCompletion{ privatevarm_delegates = newConcurrentQueue<Action>(); publicboolIsCompleted { get; } public T GetResults() { return...; } publicvoidOnCompleted(Actioncont) { //(10) varec = ExecutionContext.Capture(); varsc = SynchronizationContext.Current; m_delegates.Enqueue(delegate { ExecutionContext.Run(ec, delegate{ sc.Post(_ => cont(), null); }, null); }); } [SecurityCritical] //(11) publicvoidUnsafeOnCompleted(Actioncont) { varsc = SynchronizationContext.Current; m_delegates.Enqueue(delegate { sc.Post(_ => cont(), null); }); } } 6. Lifted local variables int x = 10; awaitt1; int z = 10; { int y = 15; Console.Write(x + y + z); } classFooAsync_StateMachine: privateint _x, _z; MoveNext: this._x = 10; await t1;this._z = 10; { int y = 15; Console.Write(this._x + y + this._x); } Lucian Wischik, VB Language PM These slides describe the IL that’semitted when you use theAsync and Await keywords in VB/C#. 7. 
Nested “try” blocks await t0; try{ awaitt1; await t2; } finally{ Console.Write("f"); } await t3; staticpublicvoidMoveNext() { bool_fin = true; switch (_state) { case0: gotoAFTERAWAIT0; case 1,2: goto STAGEPOST; case3: gotoAFTERAWAIT3; case-1: /*fallthrough*/ } if(!t0.IsCompleted) {_state=0; _fin=false; return; AFTERAWAIT0: _state=-1;} STAGEPOST: try { switch(_state) { case1: goto AFTERAWAIT1; case2: goto AFTERAWAIT2; case-1: /*fallthrough*/ } if(!t1.IsCompleted) {_state=1; _fin=false; return; AFTERAWAIT1: _state=-1;} if (!t2.IsCompleted) {_state=2; _fin=false; return; AFTERAWAIT2; _state=-1;} } finally{ if (_fin) { Console.Write("f"); } } if(!t3.IsCompleted) {_state=3; _fin=false; return; AFTERAWAIT3; _state=-1;} } 8. Stack spilling [[PUSH a]] ; (int[]) [[PUSH i]] ; (int[], int) DUP0:DUP1 ; (int[], int, int[], int) LDELEM ; (int[], int, int) POP ; (int[], int) TUPLE.NEW ; (Tuple<int[], int>) STFLD this._stack int[] a; inti; a[i].CompareTo(await t); ... RETURN ; do awaiterpattern... AFTERAWAIT0: LDFLD this._stack ; (Tuple<int[], int>) MAKE_STACK ; (int[], int) CALL t.GetResult ; (int[], int, int) MAKE_LVALUES ; (&int, int) CALL CompareTo ; (bool)

1. State Machine structFooAsync_StateMachine : IAsyncStateMachine{ //(1,2,3) privateint _state; publicAsyncTaskMethodBuilder _builder; privatevoidMoveNext() { try{ switch (_state) { TRANSFORMED_BODY } } catch (Exception ex) { _builder.SetException(ex); return; } _builder.SetResult(); } privatevoidSetStateMachine(IAsyncStateMachinesm) { _builder.SetStateMachine(sm); } } asyncTaskFooAsync() { BODY } TaskFooAsync() { varsm = newFooAsync_StateMachine(); sm._state = -1; ... copy params & this if needed into S.M. sm._builder = AsyncTaskMethodBuilder.Create(); sm._builder.Start(refsm); returnsm._builder.Task; } Perf tip: The async transformation will add code and will add local variables. If the JIT finds too much code or too many variables, then it degrades drastically. So, you’ll reach this JIT limit sooner with async methods. (1) The compiler implicitly generates a state machine for each async method. Each state corresponds to a piece of code between await statements. The MoveNext() method will advance it to the next state. (2) The state-machine is a struct, for efficiency reasons -- so that on the “fast path”, where no awaits were actually needed, it doesn’t need to be allocated on the heap. (3) The state machine implements System.Runtime.CompilerServices.IAsyncStateMachine. This interface is part of the protocol for making it efficient and secure -- detailed later.

2. structAsyncTaskMethodBuilder publicstructSystem.Runtime.CompilerServices.AsyncTaskMethodBuilder<T> { internalIAsyncStateMachinem_sm; publicvoid Start<TSM>(ref TSM sm) where TSM : IAsyncStateMachine{ //(2) Thread t = Thread.CurrentThread; ExecutionContextSwitcherecs = default(ExecutionContextSwitcher); RuntimeHelpers.PrepareConstrainedRegions(); try{ ExecutionContext.EstablishCopyOnWriteScope(t, false, refecs); sm.MoveNext(); } finally{ ecs.Undo(currentThread); } } } internalstructVoidTaskResult { } (2) We use structs a lot… The pattern "void Start<T>(ref T s) where T:I" lets us use a struct via an interface, but without boxing or copying the struct. This comes at the cost of JIT having to generate a new Start for each TSM. The state-machine is a struct, which contains AsyncTaskMethodBuilder which is a struct. At the first await point, the state machine will be boxed onto the heap, with the builder inside it, and the builder will get a pointer to that boxed state machine. It takes some work to maintain these circular references of structs. async void f() - uses AsyncVoidMethodBuilder async Task<T> f() - uses AsyncTaskMethodBuilder<T> async Task f() - uses AsyncTaskMethodBuilder, which wraps AsyncTaskMethodBuilder<VoidTaskResult>

3. Awaiter Pattern switch(_state) { case0: goto AFTERAWAIT0; case -1: // fallthrough } Console.WriteLine("a"); TaskAwaitertmp = Task.Delay(100).GetAwaiter(); //(4) if(!tmp.IsCompleted) { //(5) _state = 0; _awaiter = tmp; _builder.AwaitOnCompleted(reftmp, refthis); //(8) return; AFTERAWAIT0: //(9) tmp = (TaskAwaiter)_awaiter; _awaiter = default(TaskAwaiter); } tmp.GetResult(); //(6) tmp= default(TaskAwaiter); //(7) Console.WriteLine("b"); Console.WriteLine("a"); awaitTask.Delay(100); Console.WriteLine("b"); (4) The await operator is pattern-based. For “await t”, the compiler makes a call to t.GetAwaiter() to get an awaiter. For instance, you could make an extension method “MyAwaiter GetAwaiter(this int i)” to be able to await integers – in which case tmp would have type MyAwaiter rather than TaskAwaiter. WinRT uses this, so you can await an IAsyncInfo. (5) In this case we awaited Task.Delay(100), which won’t have completed yet. But imagine if the task had already completed. Then it would go straight to calling tmp.GetResult(), with no need for heap allocations. (6) The job of tmp.GetResult() is to throw any exceptions from the task (if any), and to return a value (if any). (7) We null-out the temporary variable immediately so it can be garbage-collected. Perf tip: If every “await” is already completed, then it will avoid all heap allocations apart from the final resultant Task. But the overhead is still about 2x that of a non-async method.

4. AwaitOnCompleted() class AsyncMethodBuilder<T>: publicvoidAwaitOnCompleted<TA, TSM>(ref TA a, ref TSM sm) where TA : INotifyCompletion where TSM : IAsyncStateMachine { if (m_sm == null) { var ignored = this.Task; // allocate a reference m_sm = (IAsyncStateMachine)sm; // box on first await m_sm.SetStateMachine(m_sm); // tie up loose ends } Actioncont; varctx = ExecutionContext.FastCapture(); if (ctx == ExecutionContext.PreAllocatedDefault) { cont = m_defaultContextAction; if (cont == null) cont = newAction(newMoveNextRunner(ctx, this.m_sm).Run); } else{ cont = newMoveNextRunner(ctx, m_sm).Run; } a.OnCompleted(cont); } Perf tip: If you avoid modifying ExecutionContext, then it avoids further heap allocations and is faster. (8) AwaitOnCompleted ultimately calls awaiter.OnCompleted(MoveNext)… First, if needed, it boxes the state machine on the heap, including the AsyncMethodBuilder struct. SetStateMachine gives the builder a back-pointer to the S.M. AwaitOnCompleted’s two arguments are both ref parameters so that we don’t need to copy them to pass them. The builder backs up the ExecutionContext before invoking Awaiter.MoveNext. And it allocates & passes a delegate (through class MoveNextRunner) which will restore the ExecutionContext. This is for security. It’s much cheaper if the ExecutionContext is never changed.

5. structTaskAwaiter (12) The user expects the code after "await" to resume on the same SynchronizationContext. It's the job of each awaiter to save the sync. context, and execute its continuation on that context. Thus: the AsyncMethodBuilder takes care of saving and restoring ExecutionContext (since that would be a security loophole otherwise), and the awaiter takes care of saving and restoring SynchronizationContext. classTask<T>{ TaskAwaiter<T>GetAwaiter() { returnnewTaskAwaiter<T>(this); } } structTaskAwaiter<T> : INotifyCompletion{ privatevarm_delegates = newConcurrentQueue<Action>(); publicboolIsCompleted { get; } public T GetResults() { return...; } publicvoidOnCompleted(Actioncont) { //(12) varsc = SynchronizationContext.Current; m_delegates.Enqueue(delegate {sc.Post(_ => cont(), null); }); } } Perf tip: You’ll typically make Structure awaiters, to avoid unnecessary heap allocation. They should be immutable: mutations in OnCompleted will be discarded if the state-machine gets boxed!

5. structTaskAwaiter (10) But if your awaiter is in a full-trust assembly which AllowsPartiallyTrustedCallers, then this would be a security hole. Therefore, your OnCompleted method has to save and restore ExecutionContext as well. (11) It would be inefficient if both AsyncMethodBuilder and your awaiter had to save+restore ExecutionContext. And so, if your awaiter implements ICriticalNotifyCompletion, then the compiler will instead emit a call to the _builder method AwaitUnsafeOnCompleted, which saves ExecutionContext as before. This will call the awaiter method UnsafeOnCompleted. You’re at liberty here to avoid restoring ExecutionContext, so long as you mark your method [SecurityCritical] – this prevents partially-trusted callers. classTask<T>{ TaskAwaiter<T>GetAwaiter() { returnnewTaskAwaiter<T>(this); } } structTaskAwaiter<T> : ICriticalNotifyCompletion{ privatevarm_delegates = newConcurrentQueue<Action>(); publicboolIsCompleted { get; } public T GetResults() { return...; } publicvoidOnCompleted(Actioncont) { //(10) varec = ExecutionContext.Capture(); varsc = SynchronizationContext.Current; m_delegates.Enqueue(delegate { ExecutionContext.Run(ec, delegate{ sc.Post(_ => cont(), null); }, null); }); } [SecurityCritical] //(11) publicvoidUnsafeOnCompleted(Actioncont) { varsc = SynchronizationContext.Current; m_delegates.Enqueue(delegate { sc.Post(_ => cont(), null); }); } } Perf tip: If you’re full-trust and allow partially-trusted callers, then use this UnsafeOnCompleted trick. class AsyncMethodBuilder<T>: publicvoidAwaitUnsafeOnCompleted<TA, TSM>(ref TA a, ref TSM sm) //(11) where TA : ICriticalNotifyCompletion where TSM : IAsyncStateMachine { ... exactly as above a.UnsafeOnCompleted(cont); }

3. Awaiter Pattern - dynamic Console.WriteLine("a"); dynamictmp= t.GetAwaiter(); // dynamic method-call if(!tmp.IsCompleted) { // dynamic property-get _state = 0; _awaiter = tmp; varcnc = tmpas ICriticalNotifyCompletion; // CLR cast, not dynamic if (cnc != null) { _builder.AwaitUnsafeOnCompleted(refcnc, refthis); } else { varnc= (INotifyCompletion)tmp; // CLR cast, not dynamic _builder.AwaitOnCompleted(refnc, refthis); } return; AFTERAWAIT0: tmp = _awaiter; _awaiter = null; } tmp.GetResult(); // dynamic method-call tmp= null; Console.WriteLine("b"); Console.WriteLine("a"); await (dynamic)t; Console.WriteLine("b"); Perf tip: If you’re full-trust and allow partially-trusted callers, then use this UnsafeOnCompleted trick. In the case of a late-bound (dynamic) await call, it attempts to cast the awaiter first as ICriticalNotifyCompletion and then as INotifyCompletion. This is done using CLR casts rather than dynamic casts.

6. Lifted local variables int x = 10; awaitt1; int z = 10; { int y = 15; Console.Write(x + y + z); } classFooAsync_StateMachine: privateint _x, _z; MoveNext: this._x = 10; await t1;this._z = 10; { int y = 15; Console.Write(this._x + y + this._z); } Perf tip: Everything runs much faster with local variables. Factor out your compute-bound inner loops into separate methods, away from “await”, so they can run faster. Ideally it’d work like this: “If a local is written before an await, and read after an await, then it must be lifted into the state-machine (either by permanently locating it in the state machine, or by putting it there just before an await and restoring it afterwards, whichever is more efficient).” In practice: If a local’s scope includes an await, then C# will permanently relocate the local into the state machine. This includes “z” in the above code. VB will relocate ALL locals into the state machine.

7. Nested “try” blocks await t0; try{ awaitt1; await t2; } finally{ Console.Write("f"); } await t3; staticpublicvoidMoveNext() { bool_fin = true; switch (_state) { case0: gotoAFTERAWAIT0; case 1,2: goto STAGEPOST; case3: gotoAFTERAWAIT3; case-1: /*fallthrough*/ } if(!t0.IsCompleted) {_state=0; _fin=false; return; AFTERAWAIT0: _state=-1;} STAGEPOST: try { switch(_state) { case1: goto AFTERAWAIT1; case2: goto AFTERAWAIT2; case-1: /*fallthrough*/ } if(!t1.IsCompleted) {_state=1; _fin=false; return; AFTERAWAIT1: _state=-1;} if (!t2.IsCompleted) {_state=2; _fin=false; return; AFTERAWAIT2: _state=-1;} } finally{ if (_fin) { Console.Write("f"); } } if(!t3.IsCompleted) {_state=3; _fin=false; return; AFTERAWAIT3: _state=-1;} } If there are “try” blocks, then the compiler emits additional switch-blocks for each one. It’s illegal to jump straight into a TRY, so the compiler uses a “staging post” label to fall into the try. Also, VB iterators use the same MoveNext trick: when the iterator’s Dispose method is called, it sets a field _isDisposing=true in the state machine, and jumps into the MoveNext. Each switch-block is followed by a test “if (_isDisposing) then return false”, which will end up calling each finally block on the way out.

8. Stack spilling [[PUSH a]] ; (int[]) [[PUSH i]] ; (int[], int) DUP0:DUP1 ; (int[], int, int[], int) LDELEM ; (int[], int, int) POP ; (int[], int) TUPLE.NEW ; (Tuple<int[], int>) STFLD this._stack int[] a; inti; a[i].CompareTo(await t); ... RETURN ; do awaiterpattern... AFTERAWAIT0: LDFLD this._stack ; (Tuple<int[], int>) MAKE_STACK ; (int[], int) CALL t.GetResult ; (int[], int, int) MAKE_LVALUES ; (&int, int) CALL CompareTo ; (bool) Perf tip: You can typically do a better job of stack-spilling than the compiler. Avoid “await” that’s nested deep inside expressions. Use it mostly in statements where it’s the first expression to be evaluated, e.g. var x = await t; using (await t) {…} foreach (var i in await t) {…} Normally, the compiler will evaluate each sub-expression in turn (pushing it onto the stack), then it will call the desired operation. If there is an “await” which doesn’t take the fast-path, then it’ll have to save the stack into the state machine before returning. That’s because the stack has to be empty at each RETURN operation. The compiler saves the stack into a Tuple of the appropriate type, stored in a state-machine field “_stack” of type Object. But if there were any managed addresses on the stack, then this isn’t allowed. In the above code, it would normally want to evaluate a[i] as a managed address (lvalue) before calling the CompareTo method; all struct methods are similarly invoked on managed addresses in case the method mutates the struct. In such cases the compiler has to avoid pushing the managed address onto the stack in the first place. Instead it pushes the constituent parts (in this case “a” and “i”). These can be saved into the Tuple okay. Later on, after the “await” has finished and immediately prior to the call to “CompareTo”, it reconstitutes those constituent parts into the address. Note: it still had to issue a dummy LDELEM call in advance, just to shake out any ArrayIndexExceptions that might arise.
Managed addresses can come only from “LOCAL”, “rvalue.FIELD”, “rvalue[rvalue]” and from ByRef parameters (disallowed in async methods). Managed addresses are only ever consumed by “lvalue.M(…)”, “lvalue=rvalue”, “lvalue+=rvalue”, “lvalue++”, and passing an lvalue ByRef.

Async Codegen

Async Codegen

Presentation Transcript

AJAX (Async JavaScript and XML) and Java Applets

Trains, Hotels, and Async

Async IO, Non Blocking IO, Blocking IO and Multithreading

C# and Visual Basic Future: Async Made Simple

Async JavaScript at Netflix

MVC4 WebAPI VS2012 async / await SignalR

{ async patterns }

Async Clinic

Async best practices for C#/VB

Async /Await

“Parallel Programming with Async and Await ”

Welcome to Async 2007 Marly Roncken Peter Beerel

Async XDS.b

www-verimag.imag.fr/~async/BIP/bip.html

Async Workgroup Update

Async. modem glossary

Network Applications: Async Servers and Operational Analysis

Async XDS.b

Think Async

Async JavaScript at Netflix