From 295ff46298a885c8bb8152f06d512686814a7f92 Mon Sep 17 00:00:00 2001 From: Michael Kamprath Date: Sun, 6 Aug 2023 20:38:59 -0700 Subject: [PATCH] added 16-bit version of mandelbrot for minimal 64 --- examples/slu4-minimal-64/README.md | 10 + examples/slu4-minimal-64/slu4-minimal-64.yaml | 106 ++++- .../software/mandelbrot16.min64 | 183 +++++++++ .../{mandelbrot.min64 => mandelbrot32.min64} | 0 .../slu4-minimal-64/software/math16lib.min64 | 374 ++++++++++++++++++ .../slu4-minimal-64/software/math32lib.min64 | 88 ++--- 6 files changed, 713 insertions(+), 48 deletions(-) create mode 100644 examples/slu4-minimal-64/software/mandelbrot16.min64 rename examples/slu4-minimal-64/software/{mandelbrot.min64 => mandelbrot32.min64} (100%) create mode 100644 examples/slu4-minimal-64/software/math16lib.min64 diff --git a/examples/slu4-minimal-64/README.md b/examples/slu4-minimal-64/README.md index 2b271a4..2f67db4 100644 --- a/examples/slu4-minimal-64/README.md +++ b/examples/slu4-minimal-64/README.md @@ -42,8 +42,13 @@ The following instruction macros have been added in the ISA configuration file f | `phs4s` | stack offset | - | Push onto stack 4 byte value currently found at indicated stack offset | | `pls2` | - | - | Pull 2 bytes from stack. Last byte pulled will be in A register. | | `pls4` | - | - | Pull 4 bytes from stack. Last byte pulled will be in A register. | +| `cpyaa` | absolute address | absolute address | Copies a single byte value from one absolute address (second operand) to another (first operand). | `cpy2as` | absolute address | stack offset | Copy 2 bytes of data sourced from indicated stack offset to memory starting at indicated absolute address. Convert from stack big endian ordering to RAM little endian ordering. | | `cpy2sa` | stack offset | absolute address | Copy 2 bytes of data sourced from absolute address to stack at indicated offset. Convert from RAM little endian to stack big endian ordering ordering. 
| +| `cpy2ai` | absolute address | immediate | Copy 2 bytes of immediate value to memory starting at indicated absolute address. Preserves endian ordering. | +| `cpy2si` | stack offset | immediate | Copy 2 bytes of immediate value to stack at indicated offset. Convert from RAM little endian to stack big endian ordering. | +| `cpy2ss` | stack offset | stack offset | Copy 2 bytes of data from stack starting at indicated offset (second operand) to another location in stack starting at indicated offset (first operand). Byte ordering is preserved. | +| `cpy2aa` | absolute address | absolute address | Copy 2 bytes starting at source address (second operand) to destination address (first operand) | | `cpy4as` | absolute address | stack offset | Copy 4 bytes of data sourced from indicated stack offset to memory starting at indicated absolute address. Convert from stack big endian ordering to RAM little endian ordering. | | `cpy4sa` | stack offset | absolute address | Copy 4 bytes of data sourced from absolute address to stack at indicated offset. Convert from RAM little endian to stack big endian ordering ordering. | | `cpy4ai` | absolute address | immediate | Copy 4 bytes of immediate value to memory starting at indicated absolute address. Preserves endian ordering. | @@ -52,6 +57,11 @@ The following instruction macros have been added in the ISA configuration file f | `cpy4aa` | absolute address | absolute address | Copy 4 bytes starting at source address (secord operand) to destination address (first operand) | | `inc16a` | absolute address | - | Increment the two byte integer value found at the absolute address | | `inc32a` | absolute address | - | Increment the two byte integer value found at the absolute address | +| `twos2s` | stack offset | - | Calculates the two's complement of a 2 byte value at a given offset into the stack and updates it in place. 
| +| `twos2a` | absolute address | - | Calculates the two's complement of a 2 byte value at indicated absolute address and updates it in place. | +| `twos4s` | stack offset | - | Calculates the two's complement of a 4 byte value at a given offset into the stack and updates it in place. | +| `twos4a` | absolute address | - | Calculates the two's complement of a 4 byte value at indicated absolute address and updates it in place. | + The operand descriptions use the definitions provided by documentation for Minimal 64. You should assume the accumulator (register `A`) is not preserved across any of these macros. diff --git a/examples/slu4-minimal-64/slu4-minimal-64.yaml b/examples/slu4-minimal-64/slu4-minimal-64.yaml index b9d4b80..e9aa533 100644 --- a/examples/slu4-minimal-64/slu4-minimal-64.yaml +++ b/examples/slu4-minimal-64/slu4-minimal-64.yaml @@ -1382,6 +1382,31 @@ macros: - "sta @ARG(0)+0" - "ldi BYTE1(@ARG(1))" - "sta @ARG(0)+1" + cpy2si: + - operands: + count: 2 + operand_sets: + list: + - immediate_8bit + - immediate_16bit + instructions: + # stack is big endian + - "ldi BYTE0(@ARG(1))" + - "sts @ARG(0)+1" + - "ldi BYTE1(@ARG(1))" + - "sts @ARG(0)+0" + cpy2ss: + - operands: + count: 2 + operand_sets: + list: + - immediate_8bit + - immediate_8bit + instructions: + - "lds @ARG(1)+0" + - "sts @ARG(0)+0" + - "lds @ARG(1)+1" + - "sts @ARG(0)+1" cpy2aa: - operands: count: 2 @@ -1454,14 +1479,15 @@ macros: - immediate_8bit - immediate_32bit instructions: + # stack is big endian - "ldi BYTE0(@ARG(1))" - - "sts @ARG(0)+0" + - "sts @ARG(0)+3" - "ldi BYTE1(@ARG(1))" - - "sts @ARG(0)+1" - - "ldi BYTE2(@ARG(1))" - "sts @ARG(0)+2" + - "ldi BYTE2(@ARG(1))" + - "sts @ARG(0)+1" - "ldi BYTE3(@ARG(1))" - - "sts @ARG(0)+3" + - "sts @ARG(0)+0" cpy4ss: - operands: count: 2 @@ -1519,3 +1545,75 @@ macros: - "acb @ARG(0)+2" - "ldi 0" - "acb @ARG(0)+3" + + twos2s: + # calculates the two's complement of the 2 byte value at offset in stack + - operands: + count: 1 + operand_sets: + 
list: + - immediate_8bit + instructions: + - "lds @ARG(0)+1" + - "not" + - "inc" + - "sts @ARG(0)+1" + - "lds @ARG(0)+0" + - "not" + - "aci 0" + - "sts @ARG(0)+0" + twos2a: # calculates the two's complement of the 2 byte value at absolute address; A must be 0 before acb so only the carry is added + - operands: + count: 1 + operand_sets: + list: + - absolute_address + instructions: + - "nob @ARG(0)+0" + - "inb @ARG(0)+0" + - "nob @ARG(0)+1" + - "ldi 0" + - "acb @ARG(0)+1" + twos4s: + # calculates the two's complement of the 4 byte value at offset in stack + - operands: + count: 1 + operand_sets: + list: + - immediate_8bit + instructions: + - "lds @ARG(0)+3" + - "not" + - "inc" + - "sts @ARG(0)+3" + - "lds @ARG(0)+2" + - "not" + - "aci 0" + - "sts @ARG(0)+2" + - "lds @ARG(0)+1" + - "not" + - "aci 0" + - "sts @ARG(0)+1" + - "lds @ARG(0)+0" + - "not" + - "aci 0" + - "sts @ARG(0)+0" + twos4a: + # calculates the two's complement of the 4 byte value at absolute address + - operands: + count: 1 + operand_sets: + list: + - absolute_address + instructions: + - "nob @ARG(0)+0" + - "inb @ARG(0)+0" + - "nob @ARG(0)+1" + - "ldi 0" + - "acb @ARG(0)+1" + - "nob @ARG(0)+2" + - "ldi 0" + - "acb @ARG(0)+2" + - "nob @ARG(0)+3" + - "ldi 0" + - "acb @ARG(0)+3" diff --git a/examples/slu4-minimal-64/software/mandelbrot16.min64 b/examples/slu4-minimal-64/software/mandelbrot16.min64 new file mode 100644 index 0000000..7acd792 --- /dev/null +++ b/examples/slu4-minimal-64/software/mandelbrot16.min64 @@ -0,0 +1,183 @@ +; Mandelbrot for the Minimal 64 Home Computer +; +; Approach is to use fixed point math to only use integer operations. 
A detailed +; explanation of this approach can be found here: +; +; https://github.com/rahra/intfract +; +#require "slu4-min64-asm >= 1.2.0" + +IMAGE_X_PIXELS = 400 +IMAGE_Y_PIXELS = 240 + +SCALE_BITS = 9 +SCALE_FACTOR = (1 << SCALE_BITS) + +MANDELBROT_START_X = -2*SCALE_FACTOR +MANDELBROT_END_X = 1*SCALE_FACTOR +MANDELBROT_STEP_X = (MANDELBROT_END_X - MANDELBROT_START_X)/IMAGE_X_PIXELS + +MANDELBROT_START_Y = -1*SCALE_FACTOR +MANDELBROT_END_Y = 1*SCALE_FACTOR +MANDELBROT_STEP_Y = (MANDELBROT_END_Y - MANDELBROT_START_Y)/IMAGE_Y_PIXELS + +MAX_ITERATIONS = $FF + + +.org $8000 +init: + spinit ; init stack + jps _Clear + cpy2ai cur_pixel_x,0 + cpy2ai cur_pixel_y,0 + +.pixel_loop_y: + ; calculate scaled y0 = cur_pixel_y*MANDELBROT_STEP_Y + MANDELBROT_START_Y + phs2a cur_pixel_y + phs2i MANDELBROT_STEP_Y + jps multiply_int16 ; results are 32 bit + pls2 + phs2i MANDELBROT_START_Y + jps add16 + cpy2as scaled_y0,1 ; fetch results + pls2 pls2 ; pop the sum and the low word of the product + +.pixel_loop_x: + ; calculate scaled x0 = cur_pixel_x*MANDELBROT_STEP_X + MANDELBROT_START_X + phs2a cur_pixel_x + phs2i MANDELBROT_STEP_X + jps multiply_int16 ; results are 32 bit + pls2 + phs2i MANDELBROT_START_X + jps add16 + cpy2as scaled_x0,1 ; fetch results + pls2 pls2 ; pop the sum and the low word of the product + + ; check if in mandelbrot set +.init_mandelbrot: + ; start iterations + ldi 0 sta iteration_count + ; initialize zx and zy + cpy2aa zx,scaled_x0 + cpy2aa zy,scaled_y0 + + ; push pixel coordinates on stack + phsa cur_pixel_x+0 + phsa cur_pixel_x+1 + phsa cur_pixel_y+0 + jps _SetPixel + pls pls pls + +.mandelbrot_loop: + ; find zx*zx + zy*zy + phs2a zx + phs2a zx + jps multiply_int16 + phsi SCALE_BITS jps asr32n pls ; rescale + pls2 + cpy2as zx_squared,1 + pls2 + + phs2a zy + phs2a zy + jps multiply_int16 + phsi SCALE_BITS jps asr32n pls ; rescale + pls2 + cpy2as zy_squared,1 + phs2a zx_squared + jps add16 + cpy2as temp_int16,1 + pls2 pls2 + + ; check if value is greater than the escape threshold of 4*SCALE_FACTOR + phs2a temp_int16 ; left value + phs2i 4*SCALE_FACTOR ; right value + jps compare_uint16 + pls2 pls2 + bgt 
.not_in_mandelbrot + + ; increment counter and check count + inb iteration_count + lda iteration_count cpi MAX_ITERATIONS + beq .in_mandelbrot ; if we are at max iterations, point is in set + + ; set up for next mandelbrot iteration + + ; zy = 2*zx*zy + scaled_y0 + phs2a zx + phs2a zy + jps multiply_int16 + phsi (SCALE_BITS-1) jps asr32n pls ; rescale + pls2 ; remove top 4 bytes + ; stack now contains 2*zx*zy + phs2a scaled_y0 + jps add16 + cpy2as zy,1 + pls2 pls2 + + ; zx = zx*zx - zy*zy + scaled_x0, but store in temp for now + phs2a zy_squared ; Y value + phs2a zx_squared ; X value + jps subtract16 ; X-Y + phs2a scaled_x0 + jps add16 + cpy2as zx,1 ; the new zx value + pls2 pls2 pls2 + + ; next loop + jpa .mandelbrot_loop + +.in_mandelbrot: + ; push pixel coordinates on stack + phsa cur_pixel_x+0 + phsa cur_pixel_x+1 + phsa cur_pixel_y+0 + jps _ClrPixel + pls pls pls ; remove pixel coordinates from stack +.not_in_mandelbrot: +.mandelbot_pixel_done: + +.pixel_loop_x_end: + ; next x pixel + inc16a cur_pixel_x + ; check to see if we are done with current x row + phs2i IMAGE_X_PIXELS + phs2a cur_pixel_x + jps compare_uint16 + pls2 pls2 + bne .pixel_loop_x + cpy2ai cur_pixel_x,0 + +.pixel_loop_y_end: + ; next y pixel + inc16a cur_pixel_y + ; check to see if we are done overall + phs2i IMAGE_Y_PIXELS + phs2a cur_pixel_y + jps compare_uint16 + pls2 pls2 + bne .pixel_loop_y + +.looping_done: + ldi 0 sta _XPos ldi 29 sta _YPos + jps _ScrollUp + jpa _Prompt + +; +; Variables +; + +cur_pixel_x: .2byte 0 +cur_pixel_y: .2byte 0 +scaled_x0: .2byte 0 +scaled_y0: .2byte 0 +zx: .2byte 0 +zy: .2byte 0 +zx_squared: .2byte 0 +zy_squared: .2byte 0 +temp_int16: .2byte 0 +iteration_count: .byte 0 + +#include "math16lib.min64" +#include "math32lib.min64" +#include "stringlib.min64" diff --git a/examples/slu4-minimal-64/software/mandelbrot.min64 b/examples/slu4-minimal-64/software/mandelbrot32.min64 similarity index 100% rename from examples/slu4-minimal-64/software/mandelbrot.min64 
rename to examples/slu4-minimal-64/software/mandelbrot32.min64 diff --git a/examples/slu4-minimal-64/software/math16lib.min64 b/examples/slu4-minimal-64/software/math16lib.min64 new file mode 100644 index 0000000..1509111 --- /dev/null +++ b/examples/slu4-minimal-64/software/math16lib.min64 @@ -0,0 +1,374 @@ +#require "slu4-min64-asm >= 1.2.0" + +; compare_uint16 +; Compares two unsigned 16-bit values to determine equality +; X ? Y +; +; Arguments +; sp+3 : right Y value (2 bytes) +; sp+5 : left X value (2 bytes) +; +; Returns +; flags will be set per comparison +; +compare_uint16: + ; first check high bytes, then others in sequence + ; values on stack are stored big endian + lds (3+0) sta .rval lds (5+0) cpa .rval bne .done + lds (3+1) sta .rval lds (5+1) cpa .rval +.done: + rts +.rval: .byte 0 + +; compare_int16 +; Compares two signed 16-bit values to determine their ordering +; X ? Y +; +; Arguments +; sp+3 : right Y value (2 bytes) +; sp+5 : left X value (2 bytes) +; +; Returns +; flags will be set per comparison +; +compare_int16: + ; first check signs (bit 7 of the high byte). if LHS is negative, check RHS sign + lds (5+0) ani %10000000 cpi 0 beq .lhs_positive + ; LHS is negative, check RHS + lds (3+0) ani %10000000 cpi 0 beq .lhs_negative_rhs_positive +.lhs_negative_rhs_negative: + ; two's complement negatives order the same as their unsigned bit patterns, so compare X against Y directly + lds (3+0) sta .rval lds (5+0) cpa .rval bne .done + lds (3+1) sta .rval lds (5+1) cpa .rval + jpa .done +.lhs_negative_rhs_positive: + ; LHS is less than RHS. do comparison of signed bits to get right flags + ldi 0 cpi 1 + jpa .done +.lhs_positive: + ; LHS is positive, check RHS + lds (3+0) ani %10000000 cpi 0 beq .lhs_positive_rhs_positive +.lhs_positive_rhs_negative: + ; LHS is greater than RHS. 
do comparison of signed bits to get right flags + ldi 1 cpi 0 + jpa .done +.lhs_positive_rhs_positive: + lds (3+0) sta .rval lds (5+0) cpa .rval bne .done + lds (3+1) sta .rval lds (5+1) cpa .rval +.done: + rts +.rval: .byte 0 + + +; multiply_uint16 +; multiply unsigned 2 byte values X*Y, producing a 4 byte unsigned result +; +; multiply_int16 +; multiply signed 2 byte values X*Y, producing a 4 byte signed result +; +; Arguments +; sp+3 - value X (multiplier) (2 bytes) +; sp+5 - value Y (multiplicand) (2 bytes) +; +; Return Value +; sp+3 - results (4 bytes) +; + +multiply_uint16: + ; return is always positive + ldi 0 sta _multiply_sign_byte + jpa _multiply + +multiply_int16: + ; determine if result is going to be negative + lds 3 ani %10000000 sta _multiply_sign_byte cpi 0 beq .check_multiplicand + ; negate multiplier (stack is big endian) + twos2s 3 +.check_multiplicand: + lds 5 ani %10000000 xra _multiply_sign_byte sta _multiply_sign_byte + lds 5 ani %10000000 cpi 0 beq .done + ; negate multiplicand (stack is big endian) + twos2s 5 +.done: + jpa _multiply +_multiply_sign_byte: + .byte 0 +_multiply: + ; set counter for 16 bits + ldi 16 sta .counter + ; set up 4 byte results memory block + cpy2ai .multiply_working_mem+2,0 ; high word initialized to 0 + cpy2as .multiply_working_mem,3 ; multiplier in low word + ; the high word stays 0 for every multiplier: both entry points pass an + ; unsigned magnitude here, so bit 15 of the multiplier is data, not a sign. + ; the sign of the product is applied at the end from _multiply_sign_byte. +.mult_loop: + ; check to see if LSb of working memory is 1 + lda .multiply_working_mem+0 lsr bcc .continue + ; add multiplicand to the high word of results + phs2s 5 + phs2a .multiply_working_mem+2 + jps add16 + cpy2as .multiply_working_mem+2,1 + pls2 + pls2 +.continue: + ; shift results right one. 
+ lrb .multiply_working_mem+3 + rrb .multiply_working_mem+2 + rrb .multiply_working_mem+1 + rrb .multiply_working_mem+0 + ; decrement counter (placing it in A) and stop if 0 + deb .counter cpi 0 bne .mult_loop +.set_sign: + ; check to see if result is negative: + lda _multiply_sign_byte cpi 0 beq .positive_results + lda .multiply_working_mem+0 not inc sts 3+3 + lda .multiply_working_mem+1 not aci 0 sts 3+2 + lda .multiply_working_mem+2 not aci 0 sts 3+1 + lda .multiply_working_mem+3 not aci 0 sts 3+0 + rts +.positive_results: + ; the entire working memory is the 32-bit result + cpy4sa 3,.multiply_working_mem + rts +.counter: .byte 0 +.multiply_working_mem: .zero 4 + +; divide16 +; Divides X by Y (note, unsigned only) +; +; Arguments: +; sp+3 : value X dividend (2 bytes) +; sp+5 : value Y divisor (2 bytes) +; +; Return Value: +; sp+3 : the quotient (replaces X) +; sp+5 : the remainder (replaces Y) +; +divide16: + ; first check divisor, then dividend, against the same pushed 0 + phs2i 0 + phs2s (5+2) + jps compare_uint16 + pls2 + beq .divide_by_zero + phs2s (3+2) + jps compare_uint16 + pls2 + pls2 + beq .return_zero + ; check if divisor > dividend + phs2s (5+0) + phs2s (3+2) + jps compare_uint16 + pls2 + pls2 + bgt .divisor_too_large +.start_division: + ; set up working memory (.working_mem): + ; little endian + ; 0 : low word (2 bytes) --> becomes quotient + ; 2 : high word (2 bytes) --> becomes remainder + ldi 0 sta .carry_bit ; init carry bit + cpy2ai .working_mem+2, 0 ; init high word + cpy2as .working_mem+0, 3 ; init low word with dividend + ldi 16 sta .counter ; init loop counter +.div_loop: + ; shift working memory and add carry bit to the right side + jps .div_lsl32 + lda .working_mem+0 + ada .carry_bit + sta .working_mem+0 + ldi 0 sta .carry_bit + ; determine if we can do subtraction + phs2s 5 ; divisor (left) + phs2a .working_mem+2 ; working value high word (right) + jps compare_uint16 + bgt .div_loop_continue +.div_loop_subtraction: + ; working value is equal to or larger than divisor + ; do the subtraction 
+ jps subtract16 + ; save subtraction results to high word and set carry bit + cpy2as .working_mem+2, 1 + ldi 1 sta .carry_bit +.div_loop_continue: + ; clear stack + pls2 + pls2 + ; decrement counter and check for 0 + deb .counter + lda .counter cpi 0 bne .div_loop +.division_done: + ; at this point we have the remainder in the high word, save it + cpy2sa 5,.working_mem+2 + ; and then we left shift one more time to get the quotient + jps .div_lsl32 + lda .working_mem+0 + ada .carry_bit + sta .working_mem+0 + cpy2sa 3,.working_mem+0 + rts +.divisor_too_large: + ; quotient = 0, remainder = dividend + cpy2ss 5, 3 + cpy2si 3, 0 + rts +.divide_by_zero: + ; for now, just return 0. only the 2 byte zero pushed for the compare is left to pop + pls2 +.return_zero: + ; stack is balanced here; zero the remainder as well as the quotient + cpy2si 5, 0 + cpy2si 3, 0 + rts +.working_mem: .zero 4 +.carry_bit: .byte 0 +.counter: .byte 0 +; .div_lsl32 +; +; local method for shifting .working_mem left 1 bit +.div_lsl32: + llb .working_mem+0 + rlb .working_mem+1 + rlb .working_mem+2 + rlb .working_mem+3 + rts + + + +; add16 +; adds Y value to X (X+Y) +; +; Arguments +; sp+3 - value X, 2 byte value, signed or unsigned +; sp+5 - value Y, 2 byte value, signed or unsigned +; +; Return Value +; sp+3 - replace the original 2 byte value with the sum +; +add16: + ; stack is big endian, save locally little endian + cpy2as .yval, 5 + ; start addition with LSB, Remember, stack is big endian + lds 3+1 ada .yval+0 sts 3+1 + lds 3+0 aca .yval+1 sts 3+0 + ; end return + rts +.yval: .2byte 0 + + +; subtract16 +; subtracts Y value from X (X-Y) +; +; Arguments +; sp+3 - value X, 2 byte value +; sp+5 - value Y, 2 byte value +; +; Return Value +; sp+3 - replace the original 2 byte value with the difference +; +; +subtract16: + ; stack is big endian, save Y locally little endian + cpy2as .yval, 5 + ; start subtraction with LSB + lds 3+1 sba .yval+0 sts 3+1 + lds 3+0 sca .yval+1 sts 3+0 + ; end return + rts +.yval: .2byte 0 + +; lsr16n +; logical shift right N bits for 16 bit values +; +; Arguments +; sp+3 - count of bits to shift 
right. 1 byte value, used as given (this routine does not mask or range check it) +; sp+4 - value to be shifted right, 2 byte value +; +; Return Value +; sp+4 - replace value with shifted value +; * Does not properly set carry flag +; +lsr16n: + ; first, save X register + txa phs + ; set up local variables (offsets are +1 because of the saved X byte) + cpy2as .local_val,4+1 + lds 3+1 tax + + ; Now see if we can save iterations by shortcutting the right shift +.iteration_saver: + ldi 7 cpx bgt .lsr_loop ; if the current bit shift count is <=7, then start doing loop + ; we need to shift 8 or more bits. Fast track a 8-bit shift by dropping LSB + cpyaa .local_val+0,.local_val+1 + ldi 0 sta .local_val+1 + ; decrement the shift counter by 8 + txa sbi 8 tax +.lsr_loop: + ; check the iteration counter is > 0, otherwise do a bit shift and decrement counter. + ldi 0 cpx beq .done + ; right shifts start from the MSB + lrb .local_val+1 + rrb .local_val+0 + dex + jpa .lsr_loop +.done: + cpy2sa 4+1,.local_val + ; restore X register before returning + pls tax + rts +.local_val: + .2byte 0 + + +; asr16n +; arithmetic shift right N bits for 16 bit values +; +; Arguments +; sp+3 - count of bits to shift right. 1 byte value, used as given (this routine does not mask or range check it) +; sp+4 - value to be shifted right, 2 byte value +; +; Return Value +; sp+4 - replace value with shifted value +; * Does not properly set carry flag +; +asr16n: + ; first, save X register + txa phs + ; set up local variables (offsets are +1 because of the saved X byte) + cpy2as .local_val,4+1 + lds 3+1 tax + ; Now see if we can save iterations by shortcutting the right shift +.iteration_saver: + ldi 7 cpx bgt .asr_loop ; if the current bit shift count is <=7, then start doing loop + ; decrement the shift counter by 8 + txa sbi 8 tax + ; we need to shift 8 or more bits. 
Fast track a 8-bit shift by dropping LSB + cpyaa .local_val+0,.local_val+1 + lda .local_val+1 ani %10000000 cpi 0 beq .zero_sign_byte + ldi $FF sta .local_val+1 + jpa .asr_loop +.zero_sign_byte: + ldi 0 sta .local_val+1 +.asr_loop: + ; check the iteration counter is > 0, otherwise do a bit shift and decrement counter. + ldi 0 cpx beq .done + ; right shifts start from the MSB + lrb .local_val+1 + rrb .local_val+0 + lda .local_val+1 ani %01000000 lsl orb .local_val+1 +.continue_asl: + dex + jpa .asr_loop + +.done: + cpy2sa 4+1,.local_val + ; restore X register before returning + pls tax + rts +.local_val: + .4byte 0 +.sign_byte: + .byte 0 diff --git a/examples/slu4-minimal-64/software/math32lib.min64 b/examples/slu4-minimal-64/software/math32lib.min64 index 86f16d4..b3151da 100644 --- a/examples/slu4-minimal-64/software/math32lib.min64 +++ b/examples/slu4-minimal-64/software/math32lib.min64 @@ -81,72 +81,72 @@ compare_int32: ; multiply_uint32: - ; uses a logical shift right - cpy2ai _shift_right_ptr,_lsr64 + ; return is always positive + ldi 0 sta _multiply_sign_byte jpa _multiply multiply_int32: - ; uses an arithmetic shift right - cpy2ai _shift_right_ptr,_asr64 - jpa _multiply - -_shift_right_ptr: - .2byte 0 -_shift_right_func: - jpr _shift_right_ptr -_lsr64: - ; save MSB to do a arithmetic shift right - lrb _multiply_working_mem+7 - rrb _multiply_working_mem+6 - rrb _multiply_working_mem+5 - rrb _multiply_working_mem+4 - rrb _multiply_working_mem+3 - rrb _multiply_working_mem+2 - rrb _multiply_working_mem+1 - rrb _multiply_working_mem+0 - rts -_asr64: - lrb _multiply_working_mem+7 - rrb _multiply_working_mem+6 - rrb _multiply_working_mem+5 - rrb _multiply_working_mem+4 - rrb _multiply_working_mem+3 - rrb _multiply_working_mem+2 - rrb _multiply_working_mem+1 - rrb _multiply_working_mem+0 - ; check if a sign bit needs to be maintained - lda _multiply_working_mem+7 ani %01000000 lsl orb _multiply_working_mem+7 + ; determine if result is going to be negative + lds 3 
ani %10000000 sta _multiply_sign_byte cpi 0 beq .check_multiplicand + ; negate mutiplier (stack is big endian) + twos4s 3 +.check_multiplicand: + lds 7 ani %10000000 xra _multiply_sign_byte sta _multiply_sign_byte + lds 7 ani %10000000 cpi 0 beq .done + ; negate mutiplicand (stack is big endian) + twos4s 7 .done: - rts -.sign_bit: .byte 0 + jpa _multiply +_multiply_sign_byte: + .byte 0 _multiply: ; set counter for 32 bits ldi 32 sta .counter ; set up 8 byte results memory block - cpy4ai _multiply_working_mem+4,0 ; high word inialized to 0 - cpy4as _multiply_working_mem,3 ; multiplier in low word + cpy4ai .multiply_working_mem+4,0 ; high word inialized to 0 + cpy4as .multiply_working_mem,3 ; multiplier in low word .mult_loop: ; check to see if LSb of working memory is 1 - lda _multiply_working_mem+0 lsr bcc .continue + lda .multiply_working_mem+0 lsr bcc .continue ; add high word of results to multiplicand phs4s 7 - phs4a _multiply_working_mem+4 + phs4a .multiply_working_mem+4 jps add32 - cpy4as _multiply_working_mem+4,1 + cpy4as .multiply_working_mem+4,1 pls4 pls4 .continue: ; shift results right one. 
- jps _shift_right_func + lrb .multiply_working_mem+7 + rrb .multiply_working_mem+6 + rrb .multiply_working_mem+5 + rrb .multiply_working_mem+4 + rrb .multiply_working_mem+3 + rrb .multiply_working_mem+2 + rrb .multiply_working_mem+1 + rrb .multiply_working_mem+0 ; decrement counter (placing it in A) and stop if 0 deb .counter cpi 0 bne .mult_loop -.end: - cpy4sa 3,_multiply_working_mem+4 - cpy4sa 7,_multiply_working_mem+0 +.set_sign: + ; check to see if result is negative: + lda _multiply_sign_byte cpi 0 beq .positive_results + lda .multiply_working_mem+0 not inc sts 3+7 + lda .multiply_working_mem+1 not aci 0 sts 3+6 + lda .multiply_working_mem+2 not aci 0 sts 3+5 + lda .multiply_working_mem+3 not aci 0 sts 3+4 + lda .multiply_working_mem+4 not aci 0 sts 3+3 + lda .multiply_working_mem+5 not aci 0 sts 3+2 + lda .multiply_working_mem+6 not aci 0 sts 3+1 + lda .multiply_working_mem+7 not aci 0 sts 3+0 + rts +.positive_results: + ; the entire working memory is the 64-bit results + cpy4sa 3+0,.multiply_working_mem+4 + cpy4sa 3+4,.multiply_working_mem+0 rts .counter: .byte 0 -_multiply_working_mem: .zero 8 +.multiply_working_mem: .zero 8 ; divide32 ; Divides X by Y (note, unsigned only)