Skip to content

Commit

Permalink
Update mmu_get... and mmu_set... (#8290)
Browse files Browse the repository at this point in the history
These changes are needed to address bugs that can emerge with the improved optimization from the GCC 10.3 compiler.

Updated performance inline functions `mmu_get_uint8()`, ... and `mmu_set_uint8()`, ...  to comply with strict-aliasing rules. 
Without this change, stale data may be referenced. This issue was revealed in discussions on #8261 (comment) 

Changes to avoid over-optimization of 32-bit wide transfers from IRAM, turning into 8-bit or 16-bit transfers by the new GCC 10.3 compiler. This has been a reoccurring/tricky problem for me with the new compiler. 

So far referencing the 32-bit value loaded by way of an Extended ASM R/W output register has stopped the compiler from optimizing down to an 8-bit or 16-bit transfer.

Example:
```cpp
  uint32_t val;
  __builtin_memcpy(&val, v32, sizeof(uint32_t));
  asm volatile ("" :"+r"(val)); // inject 32-bit dependency
  ...
```

Updated example `irammem.ino`
* do a simple test of compliance to strict-aliasing rules
* For `mmu_get_uint8()`, added tests to evaluate if 32-bit wide transfers were converted to an 8-bit transfer.
  • Loading branch information
mhightower83 committed Oct 13, 2021
1 parent 9d024d1 commit b7a2f44
Show file tree
Hide file tree
Showing 4 changed files with 306 additions and 42 deletions.
126 changes: 90 additions & 36 deletions cores/esp8266/mmu_iram.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,24 @@
extern "C" {
#endif

//C This turns on range checking. Is this the value you want to trigger it?
// This turns on range checking.
#ifdef DEBUG_ESP_CORE
#define DEBUG_ESP_MMU
#endif

#if defined(CORE_MOCK)
#define ets_uart_printf(...) do {} while(false)
#define XCHAL_INSTRAM0_VADDR 0x40000000
#define XCHAL_INSTRAM1_VADDR 0x40100000
#define XCHAL_INSTROM0_VADDR 0x40200000
#else
#include <sys/config.h> // For config/core-isa.h
/*
Cautiously use XCHAL_..._VADDR values where possible.
While XCHAL_..._VADDR values in core-isa.h may define the Xtensa processor
CONFIG options, they are not always an indication of DRAM, IRAM, or ROM
size or position in the address space.
*/
#endif

/*
Expand Down Expand Up @@ -71,32 +82,34 @@ DBG_MMU_FLUSH(0)

static inline __attribute__((always_inline))
bool mmu_is_iram(const void *addr) {
#define IRAM_START 0x40100000UL
const uintptr_t iram_start = (uintptr_t)XCHAL_INSTRAM1_VADDR;
#ifndef MMU_IRAM_SIZE
#if defined(__GNUC__) && !defined(CORE_MOCK)
#warning "MMU_IRAM_SIZE was undefined, setting to 0x8000UL!"
#endif
#define MMU_IRAM_SIZE 0x8000UL
#define MMU_IRAM_SIZE 0x8000ul
#endif
#define IRAM_END (IRAM_START + MMU_IRAM_SIZE)
const uintptr_t iram_end = iram_start + MMU_IRAM_SIZE;

return (IRAM_START <= (uintptr_t)addr && IRAM_END > (uintptr_t)addr);
return (iram_start <= (uintptr_t)addr && iram_end > (uintptr_t)addr);
}

static inline __attribute__((always_inline))
bool mmu_is_dram(const void *addr) {
#define DRAM_START 0x3FF80000UL
#define DRAM_END 0x40000000UL
const uintptr_t dram_start = 0x3FFE8000ul;
// The start of the Boot ROM sits at the end of DRAM. 0x40000000ul;
const uintptr_t dram_end = (uintptr_t)XCHAL_INSTRAM0_VADDR;

return (DRAM_START <= (uintptr_t)addr && DRAM_END > (uintptr_t)addr);
return (dram_start <= (uintptr_t)addr && dram_end > (uintptr_t)addr);
}

static inline __attribute__((always_inline))
bool mmu_is_icache(const void *addr) {
#define ICACHE_START 0x40200000UL
#define ICACHE_END (ICACHE_START + 0x100000UL)
extern void _irom0_text_end(void);
const uintptr_t icache_start = (uintptr_t)XCHAL_INSTROM0_VADDR;
const uintptr_t icache_end = (uintptr_t)_irom0_text_end;

return (ICACHE_START <= (uintptr_t)addr && ICACHE_END > (uintptr_t)addr);
return (icache_start <= (uintptr_t)addr && icache_end > (uintptr_t)addr);
}

#ifdef DEBUG_ESP_MMU
Expand Down Expand Up @@ -127,90 +140,131 @@ bool mmu_is_icache(const void *addr) {
static inline __attribute__((always_inline))
uint8_t mmu_get_uint8(const void *p8) {
ASSERT_RANGE_TEST_READ(p8);
uint32_t val = (*(uint32_t *)((uintptr_t)p8 & ~0x3));
uint32_t pos = ((uintptr_t)p8 & 0x3) * 8;
// https://gist.github.com/shafik/848ae25ee209f698763cffee272a58f8#how-do-we-type-pun-correctly
// Comply with strict-aliasing rules. Using memcpy is a Standards suggested
// method for type punning. The compiler optimizer will replace the memcpy
// with an `l32i` instruction. Using __builtin_memcpy to ensure we get the
// effects of the compiler optimization and not some #define version of
// memcpy.
void *v32 = (void *)((uintptr_t)p8 & ~(uintptr_t)3u);
uint32_t val;
__builtin_memcpy(&val, v32, sizeof(uint32_t));
// Use an empty ASM to reference the 32-bit value. This will block the
// compiler from immediately optimizing to an 8-bit or 16-bit load instruction
// against IRAM memory. (This approach was inspired by
// https://github.com/esp8266/Arduino/pull/7780#discussion_r548303374)
// This issue was seen when using a constant address with the GCC 10.3
// compiler.
// As a general practice, I think referencing by way of Extended ASM R/W
// output register will stop the the compiler from reloading the value later
// as 8-bit load from IRAM.
asm volatile ("" :"+r"(val)); // inject 32-bit dependency
uint32_t pos = ((uintptr_t)p8 & 3u) * 8u;
val >>= pos;
return (uint8_t)val;
}

static inline __attribute__((always_inline))
uint16_t mmu_get_uint16(const uint16_t *p16) {
ASSERT_RANGE_TEST_READ(p16);
uint32_t val = (*(uint32_t *)((uintptr_t)p16 & ~0x3));
uint32_t pos = ((uintptr_t)p16 & 0x3) * 8;
void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)0x3u);
uint32_t val;
__builtin_memcpy(&val, v32, sizeof(uint32_t));
asm volatile ("" :"+r"(val));
uint32_t pos = ((uintptr_t)p16 & 3u) * 8u;
val >>= pos;
return (uint16_t)val;
}

static inline __attribute__((always_inline))
int16_t mmu_get_int16(const int16_t *p16) {
ASSERT_RANGE_TEST_READ(p16);
uint32_t val = (*(uint32_t *)((uintptr_t)p16 & ~0x3));
uint32_t pos = ((uintptr_t)p16 & 0x3) * 8;
void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u);
uint32_t val;
__builtin_memcpy(&val, v32, sizeof(uint32_t));
asm volatile ("" :"+r"(val));
uint32_t pos = ((uintptr_t)p16 & 3u) * 8u;
val >>= pos;
return (int16_t)val;
}

static inline __attribute__((always_inline))
uint8_t mmu_set_uint8(void *p8, const uint8_t val) {
ASSERT_RANGE_TEST_WRITE(p8);
uint32_t pos = ((uintptr_t)p8 & 0x3) * 8;
uint32_t pos = ((uintptr_t)p8 & 3u) * 8u;
uint32_t sval = val << pos;
uint32_t valmask = 0x0FF << pos;
uint32_t valmask = 0x0FFu << pos;

void *v32 = (void *)((uintptr_t)p8 & ~(uintptr_t)3u);
uint32_t ival;
__builtin_memcpy(&ival, v32, sizeof(uint32_t));
asm volatile ("" :"+r"(ival));

uint32_t *p32 = (uint32_t *)((uintptr_t)p8 & ~0x3);
uint32_t ival = *p32;
ival &= (~valmask);
ival |= sval;
*p32 = ival;
/*
This 32-bit dependency injection does not appear to be needed with the
current GCC 10.3; however, that could change in the future versions. Or, I
may not have the right test for it to fail.
*/
asm volatile ("" :"+r"(ival));
__builtin_memcpy(v32, &ival, sizeof(uint32_t));
return val;
}

static inline __attribute__((always_inline))
uint16_t mmu_set_uint16(uint16_t *p16, const uint16_t val) {
ASSERT_RANGE_TEST_WRITE(p16);
uint32_t pos = ((uintptr_t)p16 & 0x3) * 8;
uint32_t pos = ((uintptr_t)p16 & 3u) * 8u;
uint32_t sval = val << pos;
uint32_t valmask = 0x0FFFF << pos;
uint32_t valmask = 0x0FFFFu << pos;

void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u);
uint32_t ival;
__builtin_memcpy(&ival, v32, sizeof(uint32_t));
asm volatile ("" :"+r"(ival));

uint32_t *p32 = (uint32_t *)((uintptr_t)p16 & ~0x3);
uint32_t ival = *p32;
ival &= (~valmask);
ival |= sval;
*p32 = ival;
asm volatile ("" :"+r"(ival));
__builtin_memcpy(v32, &ival, sizeof(uint32_t));
return val;
}

static inline __attribute__((always_inline))
int16_t mmu_set_int16(int16_t *p16, const int16_t val) {
ASSERT_RANGE_TEST_WRITE(p16);
uint32_t sval = (uint16_t)val;
uint32_t pos = ((uintptr_t)p16 & 0x3) * 8;
uint32_t pos = ((uintptr_t)p16 & 3u) * 8u;
sval <<= pos;
uint32_t valmask = 0x0FFFF << pos;
uint32_t valmask = 0x0FFFFu << pos;

void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u);
uint32_t ival;
__builtin_memcpy(&ival, v32, sizeof(uint32_t));
asm volatile ("" :"+r"(ival));

uint32_t *p32 = (uint32_t *)((uintptr_t)p16 & ~0x3);
uint32_t ival = *p32;
ival &= (~valmask);
ival |= sval;
*p32 = ival;
asm volatile ("" :"+r"(ival));
__builtin_memcpy(v32, &ival, sizeof(uint32_t));
return val;
}

#if (MMU_IRAM_SIZE > 32*1024) && !defined(MMU_SEC_HEAP)
extern void _text_end(void);
#define MMU_SEC_HEAP mmu_sec_heap()
#define MMU_SEC_HEAP_SIZE mmu_sec_heap_size()

static inline __attribute__((always_inline))
void *mmu_sec_heap(void) {
uint32_t sec_heap = (uint32_t)_text_end + 32;
return (void *)(sec_heap &= ~7);
extern void _text_end(void);
uintptr_t sec_heap = (uintptr_t)_text_end + (uintptr_t)32u;
return (void *)(sec_heap &= ~(uintptr_t)7u);
}

static inline __attribute__((always_inline))
size_t mmu_sec_heap_size(void) {
return (size_t)0xC000UL - ((size_t)mmu_sec_heap() - 0x40100000UL);
return (size_t)0xC000ul - ((uintptr_t)mmu_sec_heap() - (uintptr_t)XCHAL_INSTRAM1_VADDR);
}
#endif

Expand Down
12 changes: 9 additions & 3 deletions libraries/esp8266/examples/IramReserve/IramReserve.ino
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@
#include <umm_malloc/umm_malloc.h>
#if defined(UMM_HEAP_IRAM)

#if defined(CORE_MOCK)
#define XCHAL_INSTRAM1_VADDR 0x40100000
#else
#include <sys/config.h> // For config/core-isa.h
#endif

// durable - as in long life, persisting across reboots.
struct durable {
uint32_t bootCounter;
Expand All @@ -30,7 +36,7 @@ struct durable {
#define IRAM_RESERVE_SZ ((sizeof(struct durable) + 7UL) & ~7UL)

// Position its address just above the reduced 2nd Heap.
#define IRAM_RESERVE (0x40100000UL + 0xC000UL - IRAM_RESERVE_SZ)
#define IRAM_RESERVE ((uintptr_t)XCHAL_INSTRAM1_VADDR + 0xC000UL - IRAM_RESERVE_SZ)

// Define a reference with the right properties to make access easier.
#define DURABLE ((struct durable *)IRAM_RESERVE)
Expand Down Expand Up @@ -100,9 +106,9 @@ extern "C" void umm_init_iram(void) {
adjustments and checksums. These can affect the persistence of data across
reboots.
*/
uint32_t sec_heap = (uint32_t)_text_end + 32;
uintptr_t sec_heap = (uintptr_t)_text_end + 32;
sec_heap &= ~7;
size_t sec_heap_sz = 0xC000UL - (sec_heap - 0x40100000UL);
size_t sec_heap_sz = 0xC000UL - (sec_heap - (uintptr_t)XCHAL_INSTRAM1_VADDR);
sec_heap_sz -= IRAM_RESERVE_SZ; // Shrink IRAM heap
if (0xC000UL > sec_heap_sz) {

Expand Down
8 changes: 7 additions & 1 deletion libraries/esp8266/examples/MMU48K/MMU48K.ino
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@
#include <umm_malloc/umm_malloc.h>
#include <umm_malloc/umm_heap_select.h>

#if defined(CORE_MOCK)
#define XCHAL_INSTRAM1_VADDR 0x40100000
#else
#include <sys/config.h> // For config/core-isa.h
#endif

uint32_t timed_byte_read(char *pc, uint32_t * o);
uint32_t timed_byte_read2(char *pc, uint32_t * o);
int divideA_B(int a, int b);
Expand Down Expand Up @@ -102,7 +108,7 @@ void print_mmu_status(Print& oStream) {
#ifdef MMU_IRAM_SIZE
oStream.printf_P(PSTR(" IRAM Size: %u"), MMU_IRAM_SIZE);
oStream.println();
const uint32_t iram_free = MMU_IRAM_SIZE - (uint32_t)((uintptr_t)_text_end - 0x40100000UL);
const uint32_t iram_free = MMU_IRAM_SIZE - (uint32_t)((uintptr_t)_text_end - (uintptr_t)XCHAL_INSTRAM1_VADDR);
oStream.printf_P(PSTR(" IRAM free: %u"), iram_free);
oStream.println();
#endif
Expand Down
Loading

0 comments on commit b7a2f44

Please sign in to comment.