Skip to content

Commit

Permalink
Generate fpu functions at startup instead of at every fpu instruction…
Browse files Browse the repository at this point in the history
… encountered + fixed a bug in FLD (register version)
  • Loading branch information
ergo720 committed Jan 31, 2024
1 parent 3d48ad8 commit 3c25046
Show file tree
Hide file tree
Showing 8 changed files with 187 additions and 117 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ set(HEADERS
"${LIB86CPU_ROOT_DIR}/lib86cpu/core/allocator.h"
"${LIB86CPU_ROOT_DIR}/lib86cpu/core/breakpoint.h"
"${LIB86CPU_ROOT_DIR}/lib86cpu/core/decode.h"
"${LIB86CPU_ROOT_DIR}/lib86cpu/core/fpu.h"
"${LIB86CPU_ROOT_DIR}/lib86cpu/core/helpers.h"
"${LIB86CPU_ROOT_DIR}/lib86cpu/core/instructions.h"
"${LIB86CPU_ROOT_DIR}/lib86cpu/core/internal.h"
Expand Down
14 changes: 13 additions & 1 deletion lib86cpu/core/emitter/emitter_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,5 +203,17 @@ inline constexpr auto all_callable_funcs = std::make_tuple(
fpu_update_tag,
cpu_runtime_abort,
dbg_update_exp_hook,
tlb_invalidate_
tlb_invalidate_,
fpu_stack_check<true, fpu_instr_t::integer8>,
fpu_stack_check<false, fpu_instr_t::integer8>,
fpu_stack_check<true, fpu_instr_t::integer16>,
fpu_stack_check<false, fpu_instr_t::integer16>,
fpu_stack_check<true, fpu_instr_t::integer32>,
fpu_stack_check<false, fpu_instr_t::integer32>,
fpu_stack_check<true, fpu_instr_t::integer64>,
fpu_stack_check<false, fpu_instr_t::integer64>,
fpu_stack_check<true, fpu_instr_t::float_>,
fpu_stack_check<false, fpu_instr_t::float_>,
fpu_stack_check<true, fpu_instr_t::bcd>,
fpu_stack_check<false, fpu_instr_t::bcd>
);
160 changes: 58 additions & 102 deletions lib86cpu/core/emitter/x64/jit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,7 @@ static_assert((LOCAL_VARS_off(0) & 15) == 0); // must be 16 byte aligned so that

#define LD_MEM() load_mem(m_cpu->size_mode, 0)
#define LD_MEMs(size) load_mem(size, 0)
#define LD_MEM80i(idx) load_mem(SIZE80, 0, idx)
#define LD_MEM128() load_mem(SIZE128, 0)
#define ST_MEM(val) store_mem(val, m_cpu->size_mode, 0)
#define ST_MEMs(val, size) store_mem(val, size, 0)
Expand Down Expand Up @@ -456,6 +457,13 @@ static_assert((LOCAL_VARS_off(0) & 15) == 0); // must be 16 byte aligned so that
#define RESTORE_FPU_CTX() FLDCW(MEMD16(RSP, LOCAL_VARS_off(5)))
#define CALL_F(func) MOV(RAX, func); CALL(RAX); RELOAD_RCX_CTX()

#define CALL_FPU_SET_CTX() MOV(RAX, m_cpu->set_host_fpu_ctx_fn); CALL(RAX)
#define CALL_FPU_EXP_CHK() MOV(RAX, m_cpu->fpu_exp_post_check_fn); CALL(RAX)
#define CALL_FPU_STACK_CHK(is_push, instr_ty) LEA(R8, MEMD64(RSP, LOCAL_VARS_off(0))); \
LEA(RDX, MEMD64(RSP, LOCAL_VARS_off(2))); \
CALL_F((&fpu_stack_check<is_push, instr_ty>)); \
MOV(EBX, EAX); \
MOV(R8D, MEMD32(RSP, LOCAL_VARS_off(2)));

lc86_jit::lc86_jit(cpu_t *cpu)
{
Expand Down Expand Up @@ -552,27 +560,41 @@ lc86_jit::gen_aux_funcs()
MOV(EAX, MEMD32(RCX, CPU_CTX_INT));
RET();

// raise any int
size_t raise_int_off = m_a.offset(), raise_int_off_aligned16 = (raise_int_off + 15) & ~15;
if (raise_int_off_aligned16 > raise_int_off) {
for (unsigned i = 0; i < (raise_int_off_aligned16 - raise_int_off); ++i) {
INT3();
const auto &align_next_func_start = [this]() {
size_t off = m_a.offset(), off_aligned16 = (off + 15) & ~15;
if (off_aligned16 > off) {
for (unsigned i = 0; i < (off_aligned16 - off); ++i) {
INT3();
}
}
}
return off_aligned16;
};

// raise any int
size_t raise_int_off_aligned16 = align_next_func_start();
m_a.lock().or_(MEMD32(RCX, CPU_CTX_INT), EDX);
RET();

// clear any int
size_t clear_int_off = m_a.offset(), clear_int_off_aligned16 = (clear_int_off + 15) & ~15;
if (clear_int_off_aligned16 > clear_int_off) {
for (unsigned i = 0; i < (clear_int_off_aligned16 - clear_int_off); ++i) {
INT3();
}
}
size_t clear_int_off_aligned16 = align_next_func_start();
NOT(EDX);
m_a.lock().and_(MEMD32(RCX, CPU_CTX_INT), EDX);
RET();

// We generate the following fpu related functions once here, to avoid having to generate them at every guest fpu encountered. We cannot use host helpers for these
// because the host needs to mirror the guest control word state to the host when it emulates a guest fpu instruction, and the WIN64 calling convention
// states that the control word is non-volatile across function calls

// gen_set_host_fpu_ctx
size_t gen_set_host_fpu_ctx_off_aligned16 = align_next_func_start();
gen_set_host_fpu_ctx();
RET();

// gen_fpu_exp_post_check
size_t gen_fpu_exp_post_check_off_aligned16 = align_next_func_start();
gen_fpu_exp_post_check();
RET();

if (auto err = m_code.flatten()) {
std::string err_str("Asmjit failed at flatten() with the error ");
err_str += DebugUtils::errorAsString(err);
Expand Down Expand Up @@ -611,6 +633,8 @@ lc86_jit::gen_aux_funcs()
m_cpu->read_int_fn = reinterpret_cast<read_int_t>(static_cast<uint8_t *>(block.addr) + offset);
m_cpu->raise_int_fn = reinterpret_cast<raise_int_t>(static_cast<uint8_t *>(block.addr) + offset + raise_int_off_aligned16);
m_cpu->clear_int_fn = reinterpret_cast<clear_int_t>(static_cast<uint8_t *>(block.addr) + offset + clear_int_off_aligned16);
m_cpu->set_host_fpu_ctx_fn = reinterpret_cast<fpu_func_t>(static_cast<uint8_t *>(block.addr) + offset + gen_set_host_fpu_ctx_off_aligned16);
m_cpu->fpu_exp_post_check_fn = reinterpret_cast<fpu_func_t>(static_cast<uint8_t *>(block.addr) + offset + gen_fpu_exp_post_check_off_aligned16);
}

void
Expand Down Expand Up @@ -1857,15 +1881,15 @@ void lc86_jit::store_reg(T val, size_t reg_offset, size_t size)
}

void
lc86_jit::load_mem(uint8_t size, uint8_t is_priv)
lc86_jit::load_mem(uint8_t size, uint8_t is_priv, uint8_t stack_idx_80_128)
{
// RCX: cpu_ctx, EDX: addr, R8: instr_eip, R9B: is_priv
// for SIZE128/80 -> RCX: ptr to stack-allocated uint128/80_t, RDX: cpu_ctx, R8: addr, R9: instr_eip, stack: is_priv

switch (size)
{
case SIZE128:
LEA(RCX, MEMD64(RSP, LOCAL_VARS_off(0)));
LEA(RCX, MEMD64(RSP, get_local_var_offset(stack_idx_80_128)));
MOV(R8D, EDX);
MOV(RDX, &m_cpu->cpu_ctx);
MOV(R9D, m_cpu->instr_eip);
Expand All @@ -1874,7 +1898,7 @@ lc86_jit::load_mem(uint8_t size, uint8_t is_priv)
break;

case SIZE80:
LEA(RCX, MEMD64(RSP, LOCAL_VARS_off(0)));
LEA(RCX, MEMD64(RSP, get_local_var_offset(stack_idx_80_128)));
MOV(R8D, EDX);
MOV(RDX, &m_cpu->cpu_ctx);
MOV(R9D, m_cpu->instr_eip);
Expand Down Expand Up @@ -2351,81 +2375,6 @@ lc86_jit::gen_simd_mem_align_check()
m_a.bind(ok);
}

template<bool is_push, fpu_instr_t instr_type>
void lc86_jit::gen_fpu_stack_check()
{
// this function places in EBX the value of the fpu stack top after the push/pop, and in R8D the flags of the status word following a stack fault. It also writes
// to the host stack, at offset zero, an appropriate indefinite value when it detects a masked stack exception
// NOTE: we only support masked stack exceptions for now

Label no_stack_fault = m_a.newLabel(), exp_masked = m_a.newLabel();
XOR(R8D, R8D);
if constexpr (is_push) {
// detect stack overflow
MOVZX(EBX, MEMD16(RCX, FPU_DATA_FTOP));
SUB(EBX, 1);
AND(EBX, 7);
MOV(AX, MEMSD16(RCX, RBX, 0, CPU_CTX_FTAGS0));
CMP(AX, FPU_TAG_EMPTY);
BR_EQ(no_stack_fault);
}
else {
// detect stack underflow
MOVZX(EBX, MEMD16(RCX, FPU_DATA_FTOP));
ADD(EBX, 1);
AND(EBX, 7);
MOV(AX, MEMSD16(RCX, RBX, 0, CPU_CTX_FTAGS0));
CMP(AX, FPU_TAG_EMPTY);
BR_NE(no_stack_fault);
}

MOVZX(EAX, MEMD16(RCX, CPU_CTX_FCTRL));
TEST(EAX, FPU_EXP_INVALID);
BR_NE(exp_masked);
static const char *abort_msg = "Unmasked fpu stack exception not supported";
MOV(RCX, abort_msg);
MOV(RAX, &cpu_runtime_abort);
CALL(RAX); // won't return
INT3();
m_a.bind(exp_masked);
// stack fault exception masked, write an indefinite value, so that the fpu instr uses it
MOVZX(R8D, MEMD16(RCX, CPU_CTX_FSTATUS));
OR(R8D, FPU_FLG_IE | FPU_FLG_SF | (is_push ? (1 << FPU_C1_SHIFT) : (0 << FPU_C1_SHIFT)));

switch (instr_type)
{
case fpu_instr_t::integer8:
MOV(MEMD8(RSP, LOCAL_VARS_off(0)), FPU_INTEGER_INDEFINITE8);
break;

case fpu_instr_t::integer16:
MOV(MEMD16(RSP, LOCAL_VARS_off(0)), FPU_INTEGER_INDEFINITE16);
break;

case fpu_instr_t::integer32:
MOV(MEMD32(RSP, LOCAL_VARS_off(0)), FPU_INTEGER_INDEFINITE32);
break;

case fpu_instr_t::integer64:
MOV(MEMD64(RSP, LOCAL_VARS_off(0)), FPU_INTEGER_INDEFINITE64);
break;

case fpu_instr_t::float_:
MOV(MEMD64(RSP, LOCAL_VARS_off(0)), FPU_QNAN_FLOAT_INDEFINITE64);
MOV(MEMD16(RSP, LOCAL_VARS_off(1)), FPU_QNAN_FLOAT_INDEFINITE16);
break;

case fpu_instr_t::bcd:
MOV(MEMD64(RSP, LOCAL_VARS_off(0)), FPU_BCD_INDEFINITE64);
MOV(MEMD16(RSP, LOCAL_VARS_off(1)), FPU_BCD_INDEFINITE16);
break;

default:
LIB86CPU_ABORT();
}
m_a.bind(no_stack_fault);
}

void
lc86_jit::gen_fpu_exp_post_check()
{
Expand Down Expand Up @@ -5153,17 +5102,24 @@ lc86_jit::fld(decoded_instr *instr)
RAISEin0_t(EXP_NM);
}
else {
// XXX missing check for fpu unmasked exceptions
get_rm<OPNUM_SRC>(instr,
[this, instr](const op_info rm)
{
gen_fpu_stack_check<true, fpu_instr_t::float_>();
gen_set_host_fpu_ctx();
Label stack_fault = m_a.newLabel(), ok = m_a.newLabel();
MOV(MEMD64(RSP, LOCAL_VARS_off(0)), 0);
CALL_FPU_STACK_CHK(true, fpu_instr_t::float_);
CALL_FPU_SET_CTX();
MOV(EDX, instr->i.raw.modrm.rm);
MOV(EAX, sizeof(uint80_t));
MUL(DX);
TEST(MEMD64(RSP, LOCAL_VARS_off(0)), 0);
BR_NE(stack_fault);
FLD(MEMSD80(RCX, RAX, 0, CPU_CTX_R0));
gen_fpu_exp_post_check();
BR_UNCOND(ok);
m_a.bind(stack_fault);
FLD(MEMD32(RSP, LOCAL_VARS_off(0)));
m_a.bind(ok);
CALL_FPU_EXP_CHK();
MOV(EAX, sizeof(uint80_t));
ST_R16(FPU_DATA_FTOP, BX);
MUL(BX);
Expand All @@ -5177,31 +5133,31 @@ lc86_jit::fld(decoded_instr *instr)
case 0xD9:
LD_MEMs(SIZE32);
MOV(MEMD32(RSP, LOCAL_VARS_off(0)), EAX);
gen_fpu_stack_check<true, fpu_instr_t::float_>();
gen_set_host_fpu_ctx();
CALL_FPU_STACK_CHK(true, fpu_instr_t::float_);
CALL_FPU_SET_CTX();
FLD(MEMD32(RSP, LOCAL_VARS_off(0)));
break;

case 0xDD:
LD_MEMs(SIZE64);
MOV(MEMD64(RSP, LOCAL_VARS_off(0)), RAX);
gen_fpu_stack_check<true, fpu_instr_t::float_>();
gen_set_host_fpu_ctx();
CALL_FPU_STACK_CHK(true, fpu_instr_t::float_);
CALL_FPU_SET_CTX();
FLD(MEMD64(RSP, LOCAL_VARS_off(0)));
break;

case 0xDB:
LD_MEMs(SIZE80);
gen_fpu_stack_check<true, fpu_instr_t::float_>();
gen_set_host_fpu_ctx();
LD_MEM80i(0);
CALL_FPU_STACK_CHK(true, fpu_instr_t::float_);
CALL_FPU_SET_CTX();
FLD(MEMD80(RSP, LOCAL_VARS_off(0)));
break;

default:
LIB86CPU_ABORT();
}

gen_fpu_exp_post_check();
CALL_FPU_EXP_CHK();
MOV(EAX, sizeof(uint80_t));
ST_R16(FPU_DATA_FTOP, BX);
MUL(BX);
Expand Down
12 changes: 1 addition & 11 deletions lib86cpu/core/emitter/x64/jit.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,6 @@ struct op_info {
op_info(size_t val_, size_t bits_) : val(val_), bits(bits_) {}
};

enum class fpu_instr_t : int {
integer8,
integer16,
integer32,
integer64,
float_,
bcd,
};

class lc86_jit : public Target {
public:
Expand Down Expand Up @@ -237,7 +229,7 @@ class lc86_jit : public Target {
void load_reg(x86::Gp dst, size_t reg_offset, size_t size);
template<typename T>
void store_reg(T val, size_t reg_offset, size_t size);
void load_mem(uint8_t size, uint8_t is_priv);
void load_mem(uint8_t size, uint8_t is_priv, uint8_t stack_idx_80_128 = 0);
template<typename T, bool dont_write = false>
void store_mem(T val, uint8_t size, uint8_t is_priv);
void load_io(uint8_t size_mode);
Expand All @@ -253,8 +245,6 @@ class lc86_jit : public Target {
template<unsigned num, unsigned store_at = 0, bool write_esp = true>
void gen_stack_pop();
void gen_simd_mem_align_check();
template<bool is_push, fpu_instr_t instr_type>
void gen_fpu_stack_check();
void gen_fpu_exp_post_check();
void gen_set_host_fpu_ctx();
template<bool update_fdp>
Expand Down
Loading

0 comments on commit 3c25046

Please sign in to comment.