diff --git a/examples/slu4-minimal-64/slu4-minimal-64.yaml b/examples/slu4-minimal-64/slu4-minimal-64.yaml
index 00ae65d..b9d4b80 100644
--- a/examples/slu4-minimal-64/slu4-minimal-64.yaml
+++ b/examples/slu4-minimal-64/slu4-minimal-64.yaml
@@ -1332,6 +1332,17 @@ macros:
         - "pls"
         - "pls"
         - "pls"
+  cpyaa:
+    - operands:
+        count: 2
+        operand_sets:
+          list:
+            - absolute_address
+            - absolute_address
+      instructions:
+        # Copies one byte from the source address (arg 1) to the destination address (arg 0)
+        - "lda @ARG(1)+0"
+        - "sta @ARG(0)+0"
   cpy2as:
     - operands:
         count: 2
@@ -1371,6 +1382,19 @@ macros:
         - "sta @ARG(0)+0"
         - "ldi BYTE1(@ARG(1))"
         - "sta @ARG(0)+1"
+  cpy2aa:
+    - operands:
+        count: 2
+        operand_sets:
+          list:
+            - absolute_address
+            - absolute_address
+      instructions:
+        # Copies two bytes from the source address (arg 1) to the destination address (arg 0)
+        - "lda @ARG(1)+0"
+        - "sta @ARG(0)+0"
+        - "lda @ARG(1)+1"
+        - "sta @ARG(0)+1"
   cpy4as:
     - operands:
         count: 2
diff --git a/examples/slu4-minimal-64/software/math32lib.min64 b/examples/slu4-minimal-64/software/math32lib.min64
index 8e192a6..86f16d4 100644
--- a/examples/slu4-minimal-64/software/math32lib.min64
+++ b/examples/slu4-minimal-64/software/math32lib.min64
@@ -106,8 +106,6 @@ _lsr64:
     rrb _multiply_working_mem+0
     rts
 _asr64:
-    ; save MSB to do a arithmetic shift right
-    lda _multiply_working_mem+7 ani %10000000 sta .sign_bit
     lrb _multiply_working_mem+7
     rrb _multiply_working_mem+6
     rrb _multiply_working_mem+5
@@ -116,7 +114,9 @@ _asr64:
     rrb _multiply_working_mem+2
     rrb _multiply_working_mem+1
     rrb _multiply_working_mem+0
-    lda .sign_bit adb _multiply_working_mem+7
+    ; restore the sign: the pre-shift sign bit is now in bit 6, so shift it back up to bit 7 and OR it with the top byte
+    lda _multiply_working_mem+7 ani %01000000 lsl orb _multiply_working_mem+7
+.done:
     rts
 .sign_bit: .byte 0
 
@@ -310,22 +310,37 @@ subtract32:
 ;
 ;   Return Value
 ;       sp+4 - replace value with shifted value
-
+;       * Does not properly set the carry flag
+;
 lsr32n:
     ; first, save X register
     txa phs
     ; set up local variables
     cpy4as .local_val,4+1
     lds 3+1 tax
+
+    ; Now see if we can save iterations by shortcutting the right shift
+.iteration_saver:
+    ldi 7 cpx bgt .lsr_loop ; if the current bit shift count is <=7, then start doing loop
+    ; we need to shift 8 or more bits. Fast-track an 8-bit shift by dropping the least significant byte
+    cpyaa .local_val+0,.local_val+1
+    cpyaa .local_val+1,.local_val+2
+    cpyaa .local_val+2,.local_val+3
+    ldi 0 sta .local_val+3
+    ; decrement the shift counter by 8
+    txa sbi 8 tax
+    ; try again
+    jpa .iteration_saver
 .lsr_loop:
+    ; exit once the iteration counter reaches 0; otherwise shift by one bit and decrement the counter.
+    ldi 0 cpx beq .done
     ; right shifts start from the MSB
     lrb .local_val+3
     rrb .local_val+2
     rrb .local_val+1
     rrb .local_val+0
     dex
-    bne .lsr_loop           ; if bit count isn't a zero
-
+    jpa .lsr_loop
 .done:
     cpy4sa 4+1,.local_val
     ; restore X register before returning
@@ -344,24 +359,44 @@ lsr32n:
 ;
 ;   Return Value
 ;       sp+4 - replace value with shifted value
-
+;       * Does not properly set the carry flag
+;
 asr32n:
     ; first, save X register
     txa phs
     ; set up local variables
     cpy4as .local_val,4+1
     lds 3+1 tax
+    ; save sign byte for the iteration saver
+    lda .local_val+3 ani %10000000 cpi 0 beq .zero_sign_byte
+    ldi $FF sta .sign_byte
+    jpa .iteration_saver
+.zero_sign_byte:
+    ldi 0 sta .sign_byte
+    ; Now see if we can save iterations by shortcutting the right shift
+.iteration_saver:
+    ldi 7 cpx bgt .asr_loop ; if the current bit shift count is <=7, then start doing loop
+    ; we need to shift 8 or more bits. Fast-track an 8-bit shift by dropping the least significant byte
+    cpyaa .local_val+0,.local_val+1
+    cpyaa .local_val+1,.local_val+2
+    cpyaa .local_val+2,.local_val+3
+    cpyaa .local_val+3,.sign_byte
+    ; decrement the shift counter by 8
+    txa sbi 8 tax
+    ; try again
+    jpa .iteration_saver
 .asr_loop:
+    ; exit once the iteration counter reaches 0; otherwise shift by one bit and decrement the counter.
+    ldi 0 cpx beq .done
     ; right shifts start from the MSB
     lrb .local_val+3
     rrb .local_val+2
     rrb .local_val+1
     rrb .local_val+0
-    lda .local_val+3 ani %01000000 cpi 0 beq .continue_asl
-    ldi %10000000 orb .local_val+3 sta .local_val+3
+    lda .local_val+3 ani %01000000 lsl orb .local_val+3
 .continue_asl:
     dex
-    bne .asr_loop           ; if bit count isn't a zero
+    jpa .asr_loop
 
 .done:
     cpy4sa 4+1,.local_val
@@ -370,3 +405,5 @@ asr32n:
     rts
 .local_val:
     .4byte 0
+.sign_byte:
+    .byte 0