From 295ff46298a885c8bb8152f06d512686814a7f92 Mon Sep 17 00:00:00 2001
From: Michael Kamprath <michael@kamprath.net>
Date: Sun, 6 Aug 2023 20:38:59 -0700
Subject: [PATCH] added 16-bit version of mandelbrot for minimal 64

---
 examples/slu4-minimal-64/README.md            |  10 +
 examples/slu4-minimal-64/slu4-minimal-64.yaml | 106 ++++-
 .../software/mandelbrot16.min64               | 183 +++++++++
 .../{mandelbrot.min64 => mandelbrot32.min64}  |   0
 .../slu4-minimal-64/software/math16lib.min64  | 374 ++++++++++++++++++
 .../slu4-minimal-64/software/math32lib.min64  |  88 ++---
 6 files changed, 713 insertions(+), 48 deletions(-)
 create mode 100644 examples/slu4-minimal-64/software/mandelbrot16.min64
 rename examples/slu4-minimal-64/software/{mandelbrot.min64 => mandelbrot32.min64} (100%)
 create mode 100644 examples/slu4-minimal-64/software/math16lib.min64

diff --git a/examples/slu4-minimal-64/README.md b/examples/slu4-minimal-64/README.md
index 2b271a4..2f67db4 100644
--- a/examples/slu4-minimal-64/README.md
+++ b/examples/slu4-minimal-64/README.md
@@ -42,8 +42,13 @@ The following instruction macros have been added in the ISA configuration file f
 | `phs4s` | stack offset | - | Push onto stack 4 byte value currently found at indicated stack offset |
 | `pls2` | - | - | Pull 2 bytes from stack. Last byte pulled will be in A register. |
 | `pls4` | - | - | Pull 4 bytes from stack. Last byte pulled will be in A register. |
+| `cpyaa` | absolute address | absolute address | Copies a single byte value from one absolute address (second operand) to another (first operand). |
 | `cpy2as` | absolute address | stack offset | Copy 2 bytes of data sourced from indicated stack offset to memory starting at indicated absolute address. Convert from stack big endian ordering to RAM little endian ordering. |
 | `cpy2sa` | stack offset | absolute address | Copy 2 bytes of data sourced from absolute address to stack at indicated offset. Convert from RAM little endian to stack big endian ordering ordering. |
+| `cpy2ai` | absolute address | immediate | Copy 2 bytes of immediate value to memory starting at indicated absolute address. Preserves endian ordering. |
+| `cpy2si` | stack offset | immediate | Copy 2 bytes of immediate value to stack at indicated offset. Convert from RAM little endian to stack big endian ordering. |
+| `cpy2ss` | stack offset | stack offset | Copy 2 bytes of data from stack starting at indicated offset (second operand) to another location in stack starting at indicated offset (first operand). Byte ordering is preserved. |
+| `cpy2aa` | absolute address | absolute address | Copy 2 bytes starting at source address (second operand) to destination address (first operand) |
 | `cpy4as` | absolute address | stack offset | Copy 4 bytes of data sourced from indicated stack offset to memory starting at indicated absolute address. Convert from stack big endian ordering to RAM little endian ordering. |
 | `cpy4sa` | stack offset | absolute address | Copy 4 bytes of data sourced from absolute address to stack at indicated offset. Convert from RAM little endian to stack big endian ordering ordering. |
 | `cpy4ai` | absolute address | immediate | Copy 4 bytes of immediate value to memory starting at indicated absolute address. Preserves endian ordering. |
@@ -52,6 +57,11 @@ The following instruction macros have been added in the ISA configuration file f
 | `cpy4aa` | absolute address | absolute address | Copy 4 bytes starting at source address (secord operand) to destination address (first operand) |
 | `inc16a` | absolute address | - | Increment the two byte integer value found at the absolute address |
 | `inc32a` | absolute address | - | Increment the two byte integer value found at the absolute address |
+| `twos2s` | stack offset | - | Calculates the two's complement of a 2 byte value at a given offset into the stack and updates it in place. |
+| `twos2a` | absolute address | - | Calculates the two's complement of a 2 byte value at indicated absolute address and updates it in place. |
+| `twos4s` | stack offset | - | Calculates the two's complement of a 4 byte value at a given offset into the stack and updates it in place. |
+| `twos4a` | absolute address | - | Calculates the two's complement of a 4 byte value at indicated absolute address and updates it in place. |
+
 
 The operand descriptions use the definitions provided by documentation for Minimal 64. You should assume the accumulator (register `A`) is not preserved across any of these macros.
 
diff --git a/examples/slu4-minimal-64/slu4-minimal-64.yaml b/examples/slu4-minimal-64/slu4-minimal-64.yaml
index b9d4b80..e9aa533 100644
--- a/examples/slu4-minimal-64/slu4-minimal-64.yaml
+++ b/examples/slu4-minimal-64/slu4-minimal-64.yaml
@@ -1382,6 +1382,31 @@ macros:
         - "sta @ARG(0)+0"
         - "ldi BYTE1(@ARG(1))"
         - "sta @ARG(0)+1"
+  cpy2si:
+    - operands:
+        count: 2
+        operand_sets:
+          list:
+            - immediate_8bit
+            - immediate_16bit
+      instructions:
+        # stack is big endian
+        - "ldi BYTE0(@ARG(1))"
+        - "sts @ARG(0)+1"
+        - "ldi BYTE1(@ARG(1))"
+        - "sts @ARG(0)+0"
+  cpy2ss:
+    - operands:
+        count: 2
+        operand_sets:
+          list:
+            - immediate_8bit
+            - immediate_8bit
+      instructions:
+        - "lds @ARG(1)+0"
+        - "sts @ARG(0)+0"
+        - "lds @ARG(1)+1"
+        - "sts @ARG(0)+1"
   cpy2aa:
     - operands:
         count: 2
@@ -1454,14 +1479,15 @@ macros:
             - immediate_8bit
             - immediate_32bit
       instructions:
+        # stack is big endian
         - "ldi BYTE0(@ARG(1))"
-        - "sts @ARG(0)+0"
+        - "sts @ARG(0)+3"
         - "ldi BYTE1(@ARG(1))"
-        - "sts @ARG(0)+1"
-        - "ldi BYTE2(@ARG(1))"
         - "sts @ARG(0)+2"
+        - "ldi BYTE2(@ARG(1))"
+        - "sts @ARG(0)+1"
         - "ldi BYTE3(@ARG(1))"
-        - "sts @ARG(0)+3"
+        - "sts @ARG(0)+0"
   cpy4ss:
     - operands:
         count: 2
@@ -1519,3 +1545,75 @@ macros:
         - "acb @ARG(0)+2"
         - "ldi 0"
         - "acb @ARG(0)+3"
+
+  twos2s:
+    # calculates the two's complement of the 2 byte value at offset in stack
+    - operands:
+        count: 1
+        operand_sets:
+          list:
+            - immediate_8bit
+      instructions:
+          - "lds @ARG(0)+1"
+          - "not"
+          - "inc"
+          - "sts @ARG(0)+1"
+          - "lds @ARG(0)+0"
+          - "not"
+          - "aci 0"
+          - "sts @ARG(0)+0"
+  twos2a:  # two's complement, in place, of the 2 byte little endian value at an absolute address
+    - operands:
+        count: 1
+        operand_sets:
+          list:
+            - absolute_address
+      instructions:
+          - "nob @ARG(0)+0"
+          - "inb @ARG(0)+0"
+          - "nob @ARG(0)+1"
+          - "ldi 0"  # clear A so acb only propagates the carry, as twos4a does before each acb
+          - "acb @ARG(0)+1"
+  twos4s:
+    # calculates the two's complement of the 4 byte value at offset in stack
+    - operands:
+        count: 1
+        operand_sets:
+          list:
+            - immediate_8bit
+      instructions:
+          - "lds @ARG(0)+3"
+          - "not"
+          - "inc"
+          - "sts @ARG(0)+3"
+          - "lds @ARG(0)+2"
+          - "not"
+          - "aci 0"
+          - "sts @ARG(0)+2"
+          - "lds @ARG(0)+1"
+          - "not"
+          - "aci 0"
+          - "sts @ARG(0)+1"
+          - "lds @ARG(0)+0"
+          - "not"
+          - "aci 0"
+          - "sts @ARG(0)+0"
+  twos4a:
+    # calculates the two's complement of the 4 byte value at absolute address
+    - operands:
+        count: 1
+        operand_sets:
+          list:
+            - absolute_address
+      instructions:
+          - "nob @ARG(0)+0"
+          - "inb @ARG(0)+0"
+          - "nob @ARG(0)+1"
+          - "ldi 0"
+          - "acb @ARG(0)+1"
+          - "nob @ARG(0)+2"
+          - "ldi 0"
+          - "acb @ARG(0)+2"
+          - "nob @ARG(0)+3"
+          - "ldi 0"
+          - "acb @ARG(0)+3"
diff --git a/examples/slu4-minimal-64/software/mandelbrot16.min64 b/examples/slu4-minimal-64/software/mandelbrot16.min64
new file mode 100644
index 0000000..7acd792
--- /dev/null
+++ b/examples/slu4-minimal-64/software/mandelbrot16.min64
@@ -0,0 +1,183 @@
+; Mandelbrot for the Minimal 64 Home Computer
+;
+; Approach is to use fixed point math to only use integer operations. A detailed
+; explanation of this approach can be found here:
+;
+;       https://github.com/rahra/intfract
+;
+#require "slu4-min64-asm >= 1.2.0"
+
+IMAGE_X_PIXELS = 400
+IMAGE_Y_PIXELS = 240
+
+SCALE_BITS = 9
+SCALE_FACTOR = (1 << SCALE_BITS)
+
+MANDELBROT_START_X = -2*SCALE_FACTOR
+MANDELBROT_END_X = 1*SCALE_FACTOR
+MANDELBROT_STEP_X = (MANDELBROT_END_X - MANDELBROT_START_X)/IMAGE_X_PIXELS
+
+MANDELBROT_START_Y = -1*SCALE_FACTOR
+MANDELBROT_END_Y = 1*SCALE_FACTOR
+MANDELBROT_STEP_Y = (MANDELBROT_END_Y - MANDELBROT_START_Y)/IMAGE_Y_PIXELS
+
+MAX_ITERATIONS = $FF
+
+
+.org $8000
+init:
+    spinit              ; init stack
+    jps _Clear
+    cpy2ai cur_pixel_x,0
+    cpy2ai cur_pixel_y,0
+
+.pixel_loop_y:
+    ; calculate scaled y0
+    phs2a cur_pixel_y
+    phs2i MANDELBROT_STEP_Y
+    jps multiply_int16          ; results are 32 bit
+    pls2
+    phs2i MANDELBROT_START_Y
+    jps add16
+    cpy2as scaled_y0,1          ; fetch results
+    pls2 pls2                   ; discard upper 32 bits of multiplications
+
+.pixel_loop_x:
+    ; calculate scaled x0
+    phs2a cur_pixel_x
+    phs2i MANDELBROT_STEP_X
+    jps multiply_int16          ; results are 32 bit
+    pls2
+    phs2i MANDELBROT_START_X
+    jps add16
+    cpy2as scaled_x0,1          ; fetch results
+    pls2 pls2                   ;
+
+    ; check if in mandelbrot set
+.init_mandelbrot:
+    ; start iterations
+    ldi 0 sta iteration_count
+    ; initialize zx and zy
+    cpy2aa zx,scaled_x0
+    cpy2aa zy,scaled_y0
+
+    ; push pixel coordinates on stack
+    phsa cur_pixel_x+0
+    phsa cur_pixel_x+1
+    phsa cur_pixel_y+0
+    jps _SetPixel
+    pls pls pls
+
+.mandelbrot_loop:
+    ; find zx*zx + zy*zy
+    phs2a zx
+    phs2a zx
+    jps multiply_int16
+    phsi SCALE_BITS jps asr32n pls  ; rescale
+    pls2
+    cpy2as zx_squared,1
+    pls2
+
+    phs2a zy
+    phs2a zy
+    jps multiply_int16
+    phsi SCALE_BITS jps asr32n pls  ; rescale
+    pls2
+    cpy2as zy_squared,1
+    phs2a zx_squared
+    jps add16
+    cpy2as temp_int16,1
+    pls2 pls2
+
+    ; check if value is greater than NOT_MANDELBROT_THRESHOLD
+    phs2a temp_int16        ; left value
+    phs2i 4*SCALE_FACTOR    ; right value
+    jps compare_uint16
+    pls2 pls2
+    bgt .not_in_mandelbrot
+
+    ; increment counter and check count
+    inb iteration_count
+    lda iteration_count cpi MAX_ITERATIONS
+    beq .in_mandelbrot          ; if we are at max iterations, point is in set
+
+    ; set up for next mandelbrot iteration
+
+    ; zy = 2*zx*zy + scaled_y0
+    phs2a zx
+    phs2a zy
+    jps multiply_int16
+    phsi (SCALE_BITS-1) jps asr32n pls  ; rescale
+    pls2                                ; remove top 4 bytes
+    ; stack now contains 2*zx*zy
+    phs2a scaled_y0
+    jps add16
+    cpy2as zy,1
+    pls2 pls2
+
+    ; zx = zx*zx - zy*zy + scaled_x0, but store in temp for now
+    phs2a zy_squared        ; Y value
+    phs2a zx_squared        ; X value
+    jps subtract16          ; X-Y
+    phs2a scaled_x0
+    jps add16
+    cpy2as zx,1             ; the new zx value
+    pls2 pls2 pls2
+
+    ; next loop
+    jpa .mandelbrot_loop
+
+.in_mandelbrot:
+    ; push pixel coordinates on stack
+    phsa cur_pixel_x+0
+    phsa cur_pixel_x+1
+    phsa cur_pixel_y+0
+    jps _ClrPixel
+    pls pls pls                 ; remove pixel coordinates from stack
+.not_in_mandelbrot:
+.mandelbot_pixel_done:
+
+.pixel_loop_x_end:
+    ; next x pixel
+    inc16a cur_pixel_x
+    ; check to see if we are done with current x row
+    phs2i IMAGE_X_PIXELS
+    phs2a cur_pixel_x
+    jps compare_uint16
+    pls2 pls2
+    bne .pixel_loop_x
+    cpy2ai cur_pixel_x,0
+
+.pixel_loop_y_end:
+    ; next y pixel
+    inc16a cur_pixel_y
+    ; check to see if we are done overall
+    phs2i IMAGE_Y_PIXELS
+    phs2a cur_pixel_y
+    jps compare_uint16
+    pls2 pls2
+    bne .pixel_loop_y
+
+.looping_done:
+    ldi 0 sta _XPos ldi 29 sta _YPos
+    jps _ScrollUp
+    jpa _Prompt
+
+;
+; Variables
+;
+
+cur_pixel_x:        .2byte 0
+cur_pixel_y:        .2byte 0
+scaled_x0:          .2byte 0
+scaled_y0:          .2byte 0
+zx:                 .2byte 0
+zy:                 .2byte 0
+zx_squared:         .2byte 0
+zy_squared:         .2byte 0
+temp_int16:         .2byte 0
+iteration_count:    .byte 0
+
+#include "math16lib.min64"
+#include "math32lib.min64"
+#include "stringlib.min64"
diff --git a/examples/slu4-minimal-64/software/mandelbrot.min64 b/examples/slu4-minimal-64/software/mandelbrot32.min64
similarity index 100%
rename from examples/slu4-minimal-64/software/mandelbrot.min64
rename to examples/slu4-minimal-64/software/mandelbrot32.min64
diff --git a/examples/slu4-minimal-64/software/math16lib.min64 b/examples/slu4-minimal-64/software/math16lib.min64
new file mode 100644
index 0000000..1509111
--- /dev/null
+++ b/examples/slu4-minimal-64/software/math16lib.min64
@@ -0,0 +1,374 @@
+#require "slu4-min64-asm >= 1.2.0"
+
+; compare_uint16
+;   Compares two unsigned 16-bit values to determine equality
+;       X ? Y
+;
+;   Arguments
+;       sp+3 : right Y value (2 bytes)
+;       sp+5 : left X value (2 bytes)
+;
+;   Returns
+;       flags will be set per comparison
+;
+compare_uint16:
+    ; Compare the high bytes first (stack values are big endian, so offset +0 is the MSB).
+    ; The low bytes are only compared when the high bytes are equal.
+    lds (3+0) sta .rval lds (5+0) cpa .rval bne .done   ; A = X high byte, compared against Y high byte
+    lds (3+1) sta .rval lds (5+1) cpa .rval             ; A = X low byte, compared against Y low byte
+.done:
+    rts                 ; flags from the last cpa are what the caller branches on
+.rval: .byte 0          ; scratch copy of the Y byte currently being compared
+
+; compare_int16
+;   Compares two signed 16-bit values to determine equality
+;       X ? Y
+;
+;   Arguments
+;       sp+3 : right Y value (2 bytes)
+;       sp+5 : left X value (2 bytes)
+;
+;   Returns
+;       flags will be set per comparison
+;
+compare_int16:
+    ; first check signs: bit 7 of each high byte (stack is big endian) is the sign bit
+    lds (5+0) ani %10000000 cpi 0 beq .lhs_positive
+    ; LHS is negative, check RHS
+    lds (3+0) ani %10000000 cpi 0 beq .lhs_negative_rhs_positive
+.lhs_negative_rhs_negative:
+    ; same sign: two's complement order matches unsigned order, so compare X ? Y directly
+    lds (3+0) sta .rval lds (5+0) cpa .rval bne .done
+    lds (3+1) sta .rval lds (5+1) cpa .rval
+    jpa .done
+.lhs_negative_rhs_positive:
+    ; LHS is less than RHS. compare 0 against 1 to produce "less than" flags
+    ldi 0 cpi 1
+    jpa .done
+.lhs_positive:
+    ; LHS is positive, check RHS
+    lds (3+0) ani %10000000 cpi 0 beq .lhs_positive_rhs_positive
+.lhs_positive_rhs_negative:
+    ; LHS is greater than RHS. compare 1 against 0 to produce "greater than" flags
+    ldi 1 cpi 0
+    jpa .done
+.lhs_positive_rhs_positive:
+    lds (3+0) sta .rval lds (5+0) cpa .rval bne .done
+    lds (3+1) sta .rval lds (5+1) cpa .rval
+.done:
+    rts
+.rval: .byte 0
+
+
+; multiply_uint16
+;   multiply unsigned 2 byte values X*Y, producing an 4 byte unsigned results
+;
+; multiply_int16
+;   multiply signed 2 byte values X*Y, producing an 4 byte signed results
+;
+; Arguments
+;   sp+3 - value X (multiplier) (2 bytes)
+;   sp+5 - value Y (multiplicand) (2 bytes)
+;
+; Return Value
+;   sp+3 - results (4 bytes)
+;
+
+multiply_uint16:
+    ; return is always positive
+    ldi 0 sta _multiply_sign_byte
+    jpa _multiply
+
+multiply_int16:
+    ; determine if result is going to be negative
+    lds 3 ani %10000000 sta _multiply_sign_byte cpi 0 beq .check_multiplicand
+    ; negate mutiplier (stack is big endian)
+    twos2s 3
+.check_multiplicand:
+    lds 5 ani %10000000 xra _multiply_sign_byte sta _multiply_sign_byte
+    lds 5 ani %10000000 cpi 0 beq .done
+    ; negate mutiplicand (stack is big endian)
+    twos2s 5
+.done:
+    jpa _multiply
+_multiply_sign_byte:
+    .byte 0
+_multiply:
+    ; set counter for 16 bits (one shift-add step per multiplier bit)
+    ldi 16 sta .counter
+    ; set up 4 byte results memory block (little endian)
+    cpy2ai .multiply_working_mem+2,0     ; high word inialized to 0
+    cpy2as .multiply_working_mem,3       ; multiplier in low word
+    ; The high word must start at 0 for every input: both operands are treated as
+    ; unsigned magnitudes here (multiply_int16 has already negated negative values),
+    ; so the multiplier must not be sign-extended into the high word.
+.mult_loop:
+    ; check to see if LSb of working memory is 1
+    lda .multiply_working_mem+0 lsr bcc .continue
+    ; add multiplicand to high word of results
+    phs2s 5
+    phs2a .multiply_working_mem+2
+    jps add16
+    cpy2as .multiply_working_mem+2,1
+    pls2
+    pls2
+.continue:
+    ; shift results right one.
+    lrb .multiply_working_mem+3
+    rrb .multiply_working_mem+2
+    rrb .multiply_working_mem+1
+    rrb .multiply_working_mem+0
+    ; decrement counter (placing it in A) and stop if 0
+    deb .counter cpi 0 bne .mult_loop
+.set_sign:
+    ; check to see if result is negative:
+    lda _multiply_sign_byte cpi 0 beq .positive_results
+    lda .multiply_working_mem+0 not inc sts 3+3
+    lda .multiply_working_mem+1 not aci 0 sts 3+2
+    lda .multiply_working_mem+2 not aci 0 sts 3+1
+    lda .multiply_working_mem+3 not aci 0 sts 3+0
+    rts
+.positive_results:
+    ; the entire working memory is the 32-bit results
+    cpy4sa 3,.multiply_working_mem
+    rts
+.counter: .byte 0
+.multiply_working_mem: .zero 4
+
+; divide16
+;   Divides X by Y (note, unsigned only)
+;
+;   Arguments:
+;       sp+3 : value X dividend (2 bytes)
+;       sp+5 : value Y divisor (2 bytes)
+;
+;   Return Value:
+;       sp+3 : the quotient (replaces X)
+;       sp+5 : the remainder (replaces Y)
+;
+divide16:
+    ; first check values for 0
+    phs2i 0
+    phs2s (5+2)
+    jps compare_uint16
+    pls2
+    beq .divide_by_zero
+    phs2s (3+2)
+    jps compare_uint16
+    pls2
+    pls2
+    beq .return_zero
+    ; check if divisor > dividend
+    phs2s (5+0)
+    phs2s (3+2)
+    jps compare_uint16
+    pls2
+    pls2
+    bgt .divisor_too_large
+.start_division:
+    ; set up working memory:
+    ;   little endian
+    ;   0 : low word (2 bytes)  --> becomes quotient
+    ;   2 : high word (2 bytes) --> becomes remainder
+    ldi 0 sta .carry_bit        ; init carry bit
+    cpy2ai .working_mem+2, 0    ; init high word
+    cpy2as .working_mem+0, 3    ; init low word with dividend
+    ldi 16 sta .counter         ; init loop counter
+.div_loop:
+    ; shift working memory and add carry bit to the right side
+    jps .div_lsl32
+    lda .working_mem+0
+    ada .carry_bit
+    sta .working_mem+0
+    ldi 0 sta .carry_bit
+    ; determine if we can do subtraction
+    phs2s 5                ; divisor (left)
+    phs2a .working_mem+2   ; working value high word (right)
+    jps compare_uint16
+    bgt .div_loop_continue
+.div_loop_subtraction:
+    ; working value is equal to or larger than divisor
+    ; do the subtraction
+    jps subtract16
+    ; save subtraction results to high word and set carry bit
+    cpy2as .working_mem+2, 1
+    ldi 1 sta .carry_bit
+.div_loop_continue:
+    ; clear stack
+    pls2
+    pls2
+    ; decrement counter and check for 0
+    deb .counter
+    lda .counter cpi 0 bne .div_loop
+.division_done:
+    ; at this point we have the remainder in the high word, save it
+    cpy2sa 5,.working_mem+2
+    ; and then we left shift one more time to get the quotient
+    jps .div_lsl32
+    lda .working_mem+0
+    ada .carry_bit
+    sta .working_mem+0
+    cpy2sa 3,.working_mem+0
+    rts
+.divisor_too_large:
+    ; quotient = 0, remainder = dividend
+    cpy2ss 5, 3
+    cpy2si 3, 0
+    rts
+.divide_by_zero:
+    ; for now, just return 0. Only the 2 byte zero pushed at entry is still on the stack here.
+    pls2
+.return_zero:
+    ; quotient and remainder are both 0 (dividend was 0, or divide by zero)
+    cpy2si 5, 0
+    cpy2si 3, 0
+    rts
+.working_mem: .zero 4
+.carry_bit: .byte 0
+.counter: .byte 0
+; .div_lsl32
+;
+;   local method for shifting .working_mem left 1 bit
+.div_lsl32:
+    llb .working_mem+0
+    rlb .working_mem+1
+    rlb .working_mem+2
+    rlb .working_mem+3
+    rts
+
+
+
+; add16
+;   adds Y value to X (X+Y)
+;
+;   Arguments
+;       sp+3 - value X, 2 byte value, signed or unsigned
+;       sp+5 - value Y, 2 byte value, signed or unsigned
+;
+;   Return Value
+;       sp+3 - replace the original 2 byte value with the sum
+;
+add16:
+    ; Y is on the stack big endian; copy it to .yval, which is stored little endian
+    cpy2as .yval, 5
+    ; add the low bytes first, then the high bytes. The stack is big endian, so X's LSB is at 3+1
+    lds 3+1 ada .yval+0 sts 3+1
+    lds 3+0 aca .yval+1 sts 3+0
+    ; the sum has replaced X on the stack; Y is left as is
+    rts
+.yval: .2byte 0
+
+
+; subtract16
+;   subtracts Y value from X (X-Y)
+;
+;   Arguments
+;       sp+3 - value X, 2 byte value
+;       sp+5 - value Y, 2 byte value
+;
+;   Return Value
+;       sp+3 - replace the original 2 byte value with the difference
+;
+;
+subtract16:
+    ; Y is on the stack big endian; copy it to .yval, which is stored little endian
+    cpy2as .yval, 5
+    ; start subtraction with the LSB, then do the high byte. The stack is big endian, so X's LSB is at 3+1
+    lds 3+1 sba .yval+0 sts 3+1
+    lds 3+0 sca .yval+1 sts 3+0
+    ; the difference X-Y has replaced X on the stack; Y is left as is
+    rts
+.yval: .2byte 0
+
+; lsr16n
+;   logical shift right N bits for 16 bit values
+;
+;   Arguments
+;       sp+3 - count of bits to shift right. 1 byte value, masked to %00001111, if great return zero
+;       sp+4 - value to be shifted right, 2 byte value
+;
+;   Return Value
+;       sp+4 - replace value with shifted value
+;       * Does not porperly set carry flag
+;
+lsr16n:
+    ; first, save X register
+    txa phs
+    ; set up local variables
+    cpy2as .local_val,4+1
+    lds 3+1 tax
+
+    ; Now see if we can save iterations by shortcutting the right shift
+.iteration_saver:
+    ldi 7 cpx bgt .lsr_loop ; if the current bit shift count is <=7, then start doing loop
+    ; we need to shift 8 or more bits. Fast track a 8-bit shift by dropping LSB
+    cpyaa .local_val+0,.local_val+1
+    ldi 0 sta .local_val+1
+    ; decrement the shift counter by 8
+    txa sbi 8 tax
+.lsr_loop:
+    ; check the iteration counter is > 0, otherwise do a bit shift and decrement counter.
+    ldi 0 cpx beq .done
+    ; right shifts start from the MSB
+    lrb .local_val+1
+    rrb .local_val+0
+    dex
+    jpa .lsr_loop
+.done:
+    cpy2sa 4+1,.local_val
+    ; restore X register before returning
+    pls tax
+    rts
+.local_val:
+    .2byte 0
+
+
+; asr16n
+;   arithmetic shift right N bits for 16 bit values
+;
+;   Arguments
+;       sp+3 - count of bits to shift right. 1 byte value, masked to %00001111, if great return zero
+;       sp+4 - value to be shifted right, 2 byte value
+;
+;   Return Value
+;       sp+4 - replace value with shifted value
+;       * Does not porperly set carry flag
+;
+asr16n:
+    ; first, save X register
+    txa phs
+    ; set up local variables
+    cpy2as .local_val,4+1
+    lds 3+1 tax
+    ; Now see if we can save iterations by shortcutting the right shift
+.iteration_saver:
+    ldi 7 cpx bgt .asr_loop ; if the current bit shift count is <=7, then start doing loop
+    ; decrement the shift counter by 8
+    txa sbi 8 tax
+    ; we need to shift 8 or more bits. Fast track a 8-bit shift by dropping LSB
+    cpyaa .local_val+0,.local_val+1
+    lda .local_val+1 ani %10000000 cpi 0 beq .zero_sign_byte
+    ldi $FF sta .local_val+1
+    jpa .asr_loop
+.zero_sign_byte:
+    ldi 0 sta .local_val+1
+.asr_loop:
+    ; check the iteration counter is > 0, otherwise do a bit shift and decrement counter.
+    ldi 0 cpx beq .done
+    ; right shifts start from the MSB
+    lrb .local_val+1
+    rrb .local_val+0
+    lda .local_val+1 ani %01000000 lsl orb .local_val+1
+.continue_asl:
+    dex
+    jpa .asr_loop
+
+.done:
+    cpy2sa 4+1,.local_val
+    ; restore X register before returning
+    pls tax
+    rts
+.local_val:
+    .4byte 0
+.sign_byte:
+    .byte 0
diff --git a/examples/slu4-minimal-64/software/math32lib.min64 b/examples/slu4-minimal-64/software/math32lib.min64
index 86f16d4..b3151da 100644
--- a/examples/slu4-minimal-64/software/math32lib.min64
+++ b/examples/slu4-minimal-64/software/math32lib.min64
@@ -81,72 +81,72 @@ compare_int32:
 ;
 
 multiply_uint32:
-    ; uses a logical shift right
-    cpy2ai _shift_right_ptr,_lsr64
+    ; return is always positive
+    ldi 0 sta _multiply_sign_byte
     jpa _multiply
 
 multiply_int32:
-    ; uses an arithmetic shift right
-    cpy2ai _shift_right_ptr,_asr64
-    jpa _multiply
-
-_shift_right_ptr:
-    .2byte 0
-_shift_right_func:
-    jpr _shift_right_ptr
-_lsr64:
-    ; save MSB to do a arithmetic shift right
-    lrb _multiply_working_mem+7
-    rrb _multiply_working_mem+6
-    rrb _multiply_working_mem+5
-    rrb _multiply_working_mem+4
-    rrb _multiply_working_mem+3
-    rrb _multiply_working_mem+2
-    rrb _multiply_working_mem+1
-    rrb _multiply_working_mem+0
-    rts
-_asr64:
-    lrb _multiply_working_mem+7
-    rrb _multiply_working_mem+6
-    rrb _multiply_working_mem+5
-    rrb _multiply_working_mem+4
-    rrb _multiply_working_mem+3
-    rrb _multiply_working_mem+2
-    rrb _multiply_working_mem+1
-    rrb _multiply_working_mem+0
-    ; check if a sign bit needs to be maintained
-    lda _multiply_working_mem+7 ani %01000000 lsl orb _multiply_working_mem+7
+    ; determine if result is going to be negative
+    lds 3 ani %10000000 sta _multiply_sign_byte cpi 0 beq .check_multiplicand
+    ; negate mutiplier (stack is big endian)
+    twos4s 3
+.check_multiplicand:
+    lds 7 ani %10000000 xra _multiply_sign_byte sta _multiply_sign_byte
+    lds 7 ani %10000000 cpi 0 beq .done
+    ; negate mutiplicand (stack is big endian)
+    twos4s 7
 .done:
-    rts
-.sign_bit: .byte 0
+    jpa _multiply
 
+_multiply_sign_byte:
+    .byte 0
 _multiply:
     ; set counter for 32 bits
     ldi 32 sta .counter
     ; set up 8 byte results memory block
-    cpy4ai _multiply_working_mem+4,0     ; high word inialized to 0
-    cpy4as _multiply_working_mem,3       ; multiplier in low word
+    cpy4ai .multiply_working_mem+4,0     ; high word inialized to 0
+    cpy4as .multiply_working_mem,3       ; multiplier in low word
 .mult_loop:
     ; check to see if LSb of working memory is 1
-    lda _multiply_working_mem+0 lsr bcc .continue
+    lda .multiply_working_mem+0 lsr bcc .continue
     ; add high word of results to multiplicand
     phs4s 7
-    phs4a _multiply_working_mem+4
+    phs4a .multiply_working_mem+4
     jps add32
-    cpy4as _multiply_working_mem+4,1
+    cpy4as .multiply_working_mem+4,1
     pls4
     pls4
 .continue:
     ; shift results right one.
-    jps _shift_right_func
+    lrb .multiply_working_mem+7
+    rrb .multiply_working_mem+6
+    rrb .multiply_working_mem+5
+    rrb .multiply_working_mem+4
+    rrb .multiply_working_mem+3
+    rrb .multiply_working_mem+2
+    rrb .multiply_working_mem+1
+    rrb .multiply_working_mem+0
     ; decrement counter (placing it in A) and stop if 0
     deb .counter cpi 0 bne .mult_loop
-.end:
-    cpy4sa 3,_multiply_working_mem+4
-    cpy4sa 7,_multiply_working_mem+0
+.set_sign:
+    ; check to see if result is negative:
+    lda _multiply_sign_byte cpi 0 beq .positive_results
+    lda .multiply_working_mem+0 not inc sts 3+7
+    lda .multiply_working_mem+1 not aci 0 sts 3+6
+    lda .multiply_working_mem+2 not aci 0 sts 3+5
+    lda .multiply_working_mem+3 not aci 0 sts 3+4
+    lda .multiply_working_mem+4 not aci 0 sts 3+3
+    lda .multiply_working_mem+5 not aci 0 sts 3+2
+    lda .multiply_working_mem+6 not aci 0 sts 3+1
+    lda .multiply_working_mem+7 not aci 0 sts 3+0
+    rts
+.positive_results:
+    ; the entire working memory is the 64-bit results
+    cpy4sa 3+0,.multiply_working_mem+4
+    cpy4sa 3+4,.multiply_working_mem+0
     rts
 .counter: .byte 0
-_multiply_working_mem: .zero 8
+.multiply_working_mem: .zero 8
 
 ; divide32
 ;   Divides X by Y (note, unsigned only)