Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update mmu_get... and mmu_set... #8290

Merged
merged 10 commits into from
Oct 13, 2021
89 changes: 65 additions & 24 deletions cores/esp8266/mmu_iram.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,73 +127,114 @@ bool mmu_is_icache(const void *addr) {
static inline __attribute__((always_inline))
uint8_t mmu_get_uint8(const void *p8) {
  ASSERT_RANGE_TEST_READ(p8);
  // https://gist.github.com/shafik/848ae25ee209f698763cffee272a58f8#how-do-we-type-pun-correctly
  // Comply with strict-aliasing rules. Using memcpy is a Standards suggested
  // method for type punning. The compiler optimizer will replace the memcpy
  // with an `l32i` instruction. Using __builtin_memcpy to ensure we get the
  // effects of the compiler optimization and not some #define version of
  // memcpy.
  void *v32 = (void *)((uintptr_t)p8 & ~(uintptr_t)3u);
  uint32_t val;
  __builtin_memcpy(&val, v32, sizeof(uint32_t));
  // Use an empty ASM to reference the 32-bit value. This blocks the compiler
  // from immediately optimizing to an 8-bit or 16-bit load instruction
  // against IRAM memory. (This approach was inspired by
  // https://github.com/esp8266/Arduino/pull/7780#discussion_r548303374)
  // This issue was seen when using a constant address with the GCC 10.3
  // compiler. Referencing by way of an Extended ASM R/W output register also
  // stops the compiler from reloading the value later as an 8-bit load from
  // IRAM.
  asm volatile ("" :"+r"(val)); // inject 32-bit dependency
  // Shift the addressed byte down to bits 0..7.
  uint32_t pos = ((uintptr_t)p8 & 3u) * 8u;
  val >>= pos;
  return (uint8_t)val;
}

static inline __attribute__((always_inline))
uint16_t mmu_get_uint16(const uint16_t *p16) {
  ASSERT_RANGE_TEST_READ(p16);
  // Read the enclosing aligned 32-bit word; memcpy type-puns safely under
  // strict aliasing and optimizes to an `l32i` (see mmu_get_uint8 notes).
  void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)0x3u);
  uint32_t val;
  __builtin_memcpy(&val, v32, sizeof(uint32_t));
  // Empty ASM keeps the load 32-bit wide against IRAM (see mmu_get_uint8).
  asm volatile ("" :"+r"(val));
  // Shift the addressed halfword down to bits 0..15.
  uint32_t pos = ((uintptr_t)p16 & 3u) * 8u;
  val >>= pos;
  return (uint16_t)val;
}

static inline __attribute__((always_inline))
int16_t mmu_get_int16(const int16_t *p16) {
  ASSERT_RANGE_TEST_READ(p16);
  // Read the enclosing aligned 32-bit word; memcpy type-puns safely under
  // strict aliasing and optimizes to an `l32i` (see mmu_get_uint8 notes).
  void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u);
  uint32_t val;
  __builtin_memcpy(&val, v32, sizeof(uint32_t));
  // Empty ASM keeps the load 32-bit wide against IRAM (see mmu_get_uint8).
  asm volatile ("" :"+r"(val));
  // Shift the addressed halfword down to bits 0..15; the final cast
  // reinterprets them as signed.
  uint32_t pos = ((uintptr_t)p16 & 3u) * 8u;
  val >>= pos;
  return (int16_t)val;
}

static inline __attribute__((always_inline))
uint8_t mmu_set_uint8(void *p8, const uint8_t val) {
  ASSERT_RANGE_TEST_WRITE(p8);
  // Build a mask and shifted value for the byte's position in the word.
  uint32_t pos = ((uintptr_t)p8 & 3u) * 8u;
  uint32_t sval = val << pos;
  uint32_t valmask = 0x0FFu << pos;

  // Read-modify-write the enclosing aligned 32-bit word; memcpy type-puns
  // safely under strict aliasing (see mmu_get_uint8 notes).
  void *v32 = (void *)((uintptr_t)p8 & ~(uintptr_t)3u);
  uint32_t ival;
  __builtin_memcpy(&ival, v32, sizeof(uint32_t));
  asm volatile ("" :"+r"(ival));

  ival &= (~valmask);
  ival |= sval;
  /*
    This 32-bit dependency injection does not appear to be needed with the
    current GCC 10.3; however, that could change in the future versions. Or, I
    may not have the right test for it to fail.
  */
  asm volatile ("" :"+r"(ival));
  __builtin_memcpy(v32, &ival, sizeof(uint32_t));
  return val;
}

static inline __attribute__((always_inline))
uint16_t mmu_set_uint16(uint16_t *p16, const uint16_t val) {
  ASSERT_RANGE_TEST_WRITE(p16);
  // Build a mask and shifted value for the halfword's position in the word.
  uint32_t pos = ((uintptr_t)p16 & 3u) * 8u;
  uint32_t sval = val << pos;
  uint32_t valmask = 0x0FFFFu << pos;

  // Read-modify-write the enclosing aligned 32-bit word; memcpy type-puns
  // safely under strict aliasing (see mmu_get_uint8 notes).
  void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u);
  uint32_t ival;
  __builtin_memcpy(&ival, v32, sizeof(uint32_t));
  asm volatile ("" :"+r"(ival));

  ival &= (~valmask);
  ival |= sval;
  // See mmu_set_uint8 for the rationale behind this dependency injection.
  asm volatile ("" :"+r"(ival));
  __builtin_memcpy(v32, &ival, sizeof(uint32_t));
  return val;
}

static inline __attribute__((always_inline))
int16_t mmu_set_int16(int16_t *p16, const int16_t val) {
  ASSERT_RANGE_TEST_WRITE(p16);
  // Reinterpret as unsigned before shifting to avoid sign-extension issues,
  // then build a mask and shifted value for the halfword's position.
  uint32_t sval = (uint16_t)val;
  uint32_t pos = ((uintptr_t)p16 & 3u) * 8u;
  sval <<= pos;
  uint32_t valmask = 0x0FFFFu << pos;

  // Read-modify-write the enclosing aligned 32-bit word; memcpy type-puns
  // safely under strict aliasing (see mmu_get_uint8 notes).
  void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u);
  uint32_t ival;
  __builtin_memcpy(&ival, v32, sizeof(uint32_t));
  asm volatile ("" :"+r"(ival));

  ival &= (~valmask);
  ival |= sval;
  // See mmu_set_uint8 for the rationale behind this dependency injection.
  asm volatile ("" :"+r"(ival));
  __builtin_memcpy(v32, &ival, sizeof(uint32_t));
  return val;
}

Expand All @@ -204,13 +245,13 @@ extern void _text_end(void);

static inline __attribute__((always_inline))
void *mmu_sec_heap(void) {
  // Start of the second heap: just past the end of the code in IRAM,
  // padded by 32 bytes and rounded down to an 8-byte boundary.
  uintptr_t sec_heap = (uintptr_t)_text_end + (uintptr_t)32u;
  return (void *)(sec_heap &= ~(uintptr_t)7u);
}

static inline __attribute__((always_inline))
size_t mmu_sec_heap_size(void) {
  // Bytes available in the second heap: the 48 KB (0xC000) IRAM region
  // beginning at 0x40100000, minus what the code already occupies.
  return (size_t)0xC000ul - ((uintptr_t)mmu_sec_heap() - (uintptr_t)0x40100000ul);
}
#endif

Expand Down
202 changes: 200 additions & 2 deletions libraries/esp8266/examples/irammem/irammem.ino
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,204 @@
#define ETS_PRINTF ets_uart_printf
#endif

/*
Verify mmu_get_uint16()'s compliance with strict-aliasing rules under
different optimizations.
*/
Comment on lines +17 to +20
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's already here, but would this and the rest of the file make sense as a device test file at /tests/device/test_irammem/ like there's already a umm malloc one? And given the structure of the things below with success and failures

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know much about how the optimizations might vary across processor vendors, my thinking was this way it is tested with a matching vendor/version of the ESP8266 GCC.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The basic idea is to have a separate set of 'tests' that have a clearly defined success and failures, which is useful to have grouped together instead of remembering to run the example code to sanity-check certain Core implementation details. 'Device' is just the board running the test program, and... we are fighting the c++ object lifetime assumptions done by the compiler, not the machine code specifics?

Just to note, idk if it actually plays correctly with something supposed to be hw-only and not mixed with MOCK tests


#pragma GCC push_options
// reference
#pragma GCC optimize("O0") // We expect -O0 to generate the correct results
// Reference copy of the aliasing stress loop, compiled at -O0 where the
// compiler is expected to generate correct results. aliasTestOs/O2/O3 below
// repeat the identical code at higher optimization levels for comparison.
__attribute__((noinline))
void aliasTestReference(uint16_t *x) {
// Without adherence to strict-aliasing, this sequence of code would fail
// when optimized by GCC Version 10.3
size_t len = 3;
for (size_t u = 0; u < len; u++) {
uint16_t x1 = mmu_get_uint16(&x[0]);
for (size_t v = 0; v < len; v++) {
x[v] = mmu_get_uint16(&x[v]) + x1;
}
}
}
// Tests
#pragma GCC optimize("Os")
// Same loop as aliasTestReference, compiled at -Os. Its result must match
// the -O0 reference or mmu_get_uint16() violates strict-aliasing rules.
__attribute__((noinline))
void aliasTestOs(uint16_t *x) {
size_t len = 3;
for (size_t u = 0; u < len; u++) {
uint16_t x1 = mmu_get_uint16(&x[0]);
for (size_t v = 0; v < len; v++) {
x[v] = mmu_get_uint16(&x[v]) + x1;
}
}
}
#pragma GCC optimize("O2")
// Same loop as aliasTestReference, compiled at -O2. Its result must match
// the -O0 reference or mmu_get_uint16() violates strict-aliasing rules.
__attribute__((noinline))
void aliasTestO2(uint16_t *x) {
size_t len = 3;
for (size_t u = 0; u < len; u++) {
uint16_t x1 = mmu_get_uint16(&x[0]);
for (size_t v = 0; v < len; v++) {
x[v] = mmu_get_uint16(&x[v]) + x1;
}
}
}
#pragma GCC optimize("O3")
// Same loop as aliasTestReference, compiled at -O3. Its result must match
// the -O0 reference or mmu_get_uint16() violates strict-aliasing rules.
__attribute__((noinline))
void aliasTestO3(uint16_t *x) {
size_t len = 3;
for (size_t u = 0; u < len; u++) {
uint16_t x1 = mmu_get_uint16(&x[0]);
for (size_t v = 0; v < len; v++) {
x[v] = mmu_get_uint16(&x[v]) + x1;
}
}
}

// Evaluate if the optimizer may have changed 32-bit access to 8-bit.
// 8-bit access will take longer as it will be processed through
// the exception handler. For this case the -O0 version will appear faster.
#pragma GCC optimize("O0")
// Reference timing at -O0: a single mmu_get_uint8() read from an unaligned
// constant IRAM address, measured in CPU cycles. The -Os/-O2/-O3 variants
// below are compared against this number.
__attribute__((noinline)) IRAM_ATTR
uint32_t timedRead_Reference(uint8_t *res) {
// This test case was verified with GCC 10.3
// There is a code case that can result in 32-bit wide IRAM load from memory
// being optimized down to an 8-bit memory access. In this test case we need
// to supply a constant IRAM address that is not 0 when anded with 3u.
// This section verifies that the workaround implemented by the inline
// function mmu_get_uint8() is preventing this. See comments for function
// mmu_get_uint8() in mmu_iram.h for more details.
const uint8_t *x = (const uint8_t *)0x40100003ul;
uint32_t b = ESP.getCycleCount();
*res = mmu_get_uint8(x);
return ESP.getCycleCount() - b;
}
#pragma GCC optimize("Os")
// Same timed read as timedRead_Reference, compiled at -Os.
__attribute__((noinline)) IRAM_ATTR
uint32_t timedRead_Os(uint8_t *res) {
const uint8_t *x = (const uint8_t *)0x40100003ul;
uint32_t b = ESP.getCycleCount();
*res = mmu_get_uint8(x);
return ESP.getCycleCount() - b;
}
#pragma GCC optimize("O2")
// Same timed read as timedRead_Reference, compiled at -O2.
__attribute__((noinline)) IRAM_ATTR
uint32_t timedRead_O2(uint8_t *res) {
const uint8_t *x = (const uint8_t *)0x40100003ul;
uint32_t b = ESP.getCycleCount();
*res = mmu_get_uint8(x);
return ESP.getCycleCount() - b;
}
#pragma GCC optimize("O3")
// Same timed read as timedRead_Reference, compiled at -O3.
__attribute__((noinline)) IRAM_ATTR
uint32_t timedRead_O3(uint8_t *res) {
const uint8_t *x = (const uint8_t *)0x40100003ul;
uint32_t b = ESP.getCycleCount();
*res = mmu_get_uint8(x);
return ESP.getCycleCount() - b;
}
#pragma GCC pop_options

// Print the cycle count for one optimization level and compare it against
// the -O0 reference. If the optimizer narrowed the 32-bit IRAM read to an
// 8-bit access, the read goes through the exception handler and takes far
// longer, so "faster than the reference" is the pass criterion.
static bool reportTimedRead(const char *opt, uint32_t cycle_count_ref, uint32_t cycle_count) {
  Serial.printf("  Option %s, cycle count %5u ", opt, cycle_count);
  if (cycle_count_ref > cycle_count) {
    Serial.printf("- passed\r\n");
    return true;
  }
  Serial.printf("- failed\r\n");
  return false;
}

// Verify that mmu_get_uint8()'s 32-bit wide IRAM access is preserved at
// -Os, -O2, and -O3. Returns true only when every optimized read completes
// faster than the -O0 reference read.
bool test4_32bit_loads() {
  uint8_t res;
  Serial.printf("\r\nFor mmu_get_uint8, verify that 32-bit wide IRAM access is preserved across different optimizations:\r\n");
  uint32_t cycle_count_ref = timedRead_Reference(&res);
  /*
    If the optimizer (for options -Os, -O2, and -O3) replaces the 32-bit wide
    IRAM access with an 8-bit, the exception handler will get invoked on memory
    reads. The total execution time will show a significant increase when
    compared to the reference (option -O0).
  */
  Serial.printf("  Option -O0, cycle count %5u - reference\r\n", cycle_count_ref);
  // '&=' (not '&&=') so every level runs and is reported even after a failure.
  bool result = reportTimedRead("-Os", cycle_count_ref, timedRead_Os(&res));
  result &= reportTimedRead("-O2", cycle_count_ref, timedRead_O2(&res));
  result &= reportTimedRead("-O3", cycle_count_ref, timedRead_O3(&res));
  return result;
}

// Dump the expected (reference) and actual arrays after a failed
// strict-aliasing comparison so the mismatch is visible in the log.
void printPunFail(uint16_t *ref, uint16_t *x, size_t sz) {
  Serial.printf(" Expected:");
  size_t idx = 0;
  while (idx < sz) {
    Serial.printf(" %3u", ref[idx]);
    idx++;
  }
  Serial.printf("\r\n Got: ");
  idx = 0;
  while (idx < sz) {
    Serial.printf(" %3u", x[idx]);
    idx++;
  }
  Serial.printf("\r\n");
}

bool testPunning() {
bool result = true;
// Get reference result for verifing test
alignas(alignof(uint32_t)) uint16_t x_ref[] = {1, 2, 3, 0};
mcspr marked this conversation as resolved.
Show resolved Hide resolved
aliasTestReference(x_ref); // -O0
Serial.printf("mmu_get_uint16() strict-aliasing tests with different optimizations:\r\n");

{
alignas(alignof(uint32_t)) uint16_t x[] = {1, 2, 3, 0};
aliasTestOs(x);
Serial.printf(" Option -Os ");
if (0 == memcmp(x_ref, x, sizeof(x_ref))) {
Serial.printf("- passed\r\n");
} else {
result = false;
Serial.printf("- failed\r\n");
printPunFail(x_ref, x, sizeof(x_ref) / sizeof(uint16_t));
}
}
{
alignas(alignof(uint32_t)) uint16_t x[] = {1, 2, 3, 0};
aliasTestO2(x);
Serial.printf(" Option -O2 ");
if (0 == memcmp(x_ref, x, sizeof(x_ref))) {
Serial.printf("- passed\r\n");
} else {
result = false;
Serial.printf("- failed\r\n");
printPunFail(x_ref, x, sizeof(x_ref) / sizeof(uint16_t));
}
}
{
alignas(alignof(uint32_t)) uint16_t x[] = {1, 2, 3, 0};
aliasTestO3(x);
Serial.printf(" Option -O3 ");
if (0 == memcmp(x_ref, x, sizeof(x_ref))) {
Serial.printf("- passed\r\n");
} else {
result = false;
Serial.printf("- failed\r\n");
printPunFail(x_ref, x, sizeof(x_ref) / sizeof(uint16_t));
}
}
return result;
}


uint32_t cyclesToRead_nKx32(int n, unsigned int *x, uint32_t *res) {
uint32_t b = ESP.getCycleCount();
uint32_t sum = 0;
Expand Down Expand Up @@ -95,7 +293,6 @@ uint32_t cyclesToWrite_nKx8(int n, unsigned char*x) {
}

// Compare with Inline

uint32_t cyclesToRead_nKx16_viaInline(int n, unsigned short *x, uint32_t *res) {
uint32_t b = ESP.getCycleCount();
uint32_t sum = 0;
Expand Down Expand Up @@ -159,6 +356,7 @@ uint32_t cyclesToWrite_nKx8_viaInline(int n, unsigned char*x) {
return ESP.getCycleCount() - b;
}


bool perfTest_nK(int nK, uint32_t *mem, uint32_t *imem) {
uint32_t res, verify_res;
uint32_t t;
Expand Down Expand Up @@ -317,7 +515,7 @@ void setup() {
Serial.println();


if (perfTest_nK(1, mem, imem)) {
if (perfTest_nK(1, mem, imem) && testPunning() && test4_32bit_loads()) {
Serial.println();
} else {
Serial.println("\r\n*******************************");
Expand Down