Saturday, November 20, 2010

ARMv7 kernel with L1 cache disabled.

I was (well, still am) hunting down some memory corruption inside our kernel, and figured that removing as many possible culprits as I could would be a good idea. Given the various PL310 cache controller errata, it seemed worth disabling the L2 to see if that helped stability somewhat. Doing that is as easy as disabling CONFIG_CACHE_L2X0. I obviously wasn't going to play with disabling L1 (after all, if that's where your problems are, you have bigger issues...), but once I saw the CONFIG_CPU_ICACHE_DISABLE/CONFIG_CPU_DCACHE_DISABLE options I knew I had to try them out - even if just to see our ARM target crawl.
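For reference, the relevant .config bits end up looking roughly like this (the option names are the ones from arch/arm/mm/Kconfig; the exact L2 option depends on your platform):

# L2 (PL310) off
# CONFIG_CACHE_L2X0 is not set
# L1 off - see below for the fallout
CONFIG_CPU_ICACHE_DISABLE=y
CONFIG_CPU_DCACHE_DISABLE=y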

Of course, after building with both of those I booted into a hard hang. I tried with just CONFIG_CPU_ICACHE_DISABLE, which worked (glacially), so it was disabling the D-cache that was hosing me. I wasn't going to let a measly kernel config option defeat me, so there went my Friday night :-)... It took me a while to figure out that it was actually hanging inside printk() - on a spin_lock.

On ARMv6 and above, Linux implements locking and atomic operations with the LDREX/STREX instructions. If you look at the description of these instructions, they involve an exclusive monitor, which for L1 is part of the Data Cache Unit (DCU) - and my L2 is off (not that it would have done me any good, since the PL310 doesn't contain an exclusive monitor anyway). With the D-cache disabled there is no monitor to record the reservation, so the STREX always fails and the lock appears taken forever.

Spinlocks are only used with SMP, and SMP on ARM Linux is only supported on ARMv6 and above (which is where LDREX/STREX came in), so since I didn't feel like reimplementing raw_spin_lock with the SWP instruction (deprecated on ARMv6 and later), disabling SMP was pretty much the obvious choice at 1 AM. After that I needed to enable the pre-ARMv6 variants of mutexes, locks, atomic operations, bit operations and __xchg/cmpxchg. And now it boots... Of course my user space, being compiled for ARMv7, expects functional LDREX/STREX, so it hangs there too - in the init process. Here is roughly what the kernel's locking fast path looks like, followed by the debugger view of where init gets stuck.
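To make the spinlock hang concrete, here is a simplified sketch of the ARMv6+ spinlock fast path (modeled loosely on arch/arm/include/asm/spinlock.h - the function name and the trimming are mine, not the kernel's). If STREX can never report success, the loop never exits:

static inline void arch_spin_lock_sketch(volatile unsigned long *lock)
{
	unsigned long tmp;

	__asm__ __volatile__(
"1:	ldrex	%0, [%1]\n"	/* load lock word, set the exclusive monitor */
"	teq	%0, #0\n"	/* is the lock free? */
"	strexeq	%0, %2, [%1]\n"	/* if so, try to store 1; %0 = 0 on success */
"	teqeq	%0, #0\n"	/* did the store succeed? */
"	bne	1b"		/* held or STREX failed - retry (forever, if
				   there is no exclusive monitor to succeed) */
	: "=&r" (tmp)
	: "r" (lock), "r" (1UL)
	: "cc");
}

With CONFIG_CPU_DCACHE_DISABLE set, the LDREX reservation never sticks, STREX keeps returning non-zero, and the first spinlock printk() tries to take is never acquired.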



debug>
debug>
debug>
debug> regs
 r0 00000002  r1 00020990  r2 00000002  r3 00000001
 r4 00020990  r5 00000000  r6 00000002  r7 00000000
 r8 beae1dc8  r9 beac2000 r10 00000000 r11 00000000  mode USR
 ip 00000002  sp beae1d90  lr 00015f34  pc 000087c8  cpsr 20000010
debug> bt
pid: 1  comm: init
 r0 00000002  r1 00020990  r2 00000002  r3 00000001
 r4 00020990  r5 00000000  r6 00000002  r7 00000000
 r8 beae1dc8  r9 beac2000 r10 00000000 r11 00000000  mode USR
 ip 00000002  sp beae1d90  lr 00015f34  pc 000087c8  cpsr 20000010
  invalid frame pointer fffffff4
debug>


Here is a patch against 2.6.36 if anyone wants to play around with this... The obvious improvement would be to add pre-ARMv6-style (SWP-based) spinlock support, which would let SMP stay enabled - a rough sketch of that idea follows the patch.

From 6a7c561b54dc2f1bab7bc6eff8bd365850f70f01 Mon Sep 17 00:00:00 2001
From: Andrei Warkentin <andreiw@motorola.com>
Date: Sat, 20 Nov 2010 02:55:32 -0600
Subject: [PATCH] ARMv7: Allow running with L1 cache disabled.

Makes CONFIG_CPU_DCACHE_DISABLE work, although it requires
!SMP (spinlocks use LDREX/STREX - fix that and you could
get SMP).

Change-Id: Iebf82f0afec0ae7474e46406d3f3ab1450e412c1
Signed-off-by: Andrei Warkentin <andreiw@motorola.com>
---
 arch/arm/include/asm/atomic.h     |    2 +-
 arch/arm/include/asm/cacheflush.h |    2 ++
 arch/arm/include/asm/locks.h      |    2 +-
 arch/arm/include/asm/mutex.h      |    2 +-
 arch/arm/include/asm/system.h     |    8 ++++----
 arch/arm/lib/bitops.h             |    2 +-
 arch/arm/mm/Kconfig               |    2 +-
 arch/arm/mm/cache-v7.S            |   22 ++++++++++++++++++++++
 arch/arm/mm/proc-v7.S             |    8 ++++++++
 9 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index 7e79503..16d92e8 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -27,7 +27,7 @@
 #define atomic_read(v) (*(volatile int *)&(v)->counter)
 #define atomic_set(v,i) (((v)->counter) = (i))
 
-#if __LINUX_ARM_ARCH__ >= 6
+#if __LINUX_ARM_ARCH__ >= 6 && !defined(CONFIG_CPU_DCACHE_DISABLE)
 
 /*
  * ARMv6 UP and SMP safe atomic ops.  We use load exclusive and
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 5078dc6..30c5191 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -368,6 +368,7 @@ extern void flush_dcache_page(struct page *);
 
 static inline void __flush_icache_all(void)
 {
+#ifndef CONFIG_CPU_ICACHE_DISABLE
 #ifdef CONFIG_ARM_ERRATA_411920
  extern void v6_icache_inval_all(void);
  v6_icache_inval_all();
@@ -380,6 +381,7 @@ static inline void __flush_icache_all(void)
      :
      : "r" (0));
 #endif
+#endif /* CONFIG_CPU_ICACHE_DISABLE  */
 }
 static inline void flush_kernel_vmap_range(void *addr, int size)
 {
diff --git a/arch/arm/include/asm/locks.h b/arch/arm/include/asm/locks.h
index ef4c897..82df166 100644
--- a/arch/arm/include/asm/locks.h
+++ b/arch/arm/include/asm/locks.h
@@ -12,7 +12,7 @@
 #ifndef __ASM_PROC_LOCKS_H
 #define __ASM_PROC_LOCKS_H
 
-#if __LINUX_ARM_ARCH__ >= 6
+#if __LINUX_ARM_ARCH__ >= 6 && !defined(CONFIG_CPU_DCACHE_DISABLE)
 
 #define __down_op(ptr,fail)   \
  ({     \
diff --git a/arch/arm/include/asm/mutex.h b/arch/arm/include/asm/mutex.h
index 93226cf..bcbca2c 100644
--- a/arch/arm/include/asm/mutex.h
+++ b/arch/arm/include/asm/mutex.h
@@ -8,7 +8,7 @@
 #ifndef _ASM_MUTEX_H
 #define _ASM_MUTEX_H
 
-#if __LINUX_ARM_ARCH__ < 6
+#if __LINUX_ARM_ARCH__ < 6 || defined(CONFIG_CPU_DCACHE_DISABLE)
 /* On pre-ARMv6 hardware the swp based implementation is the most efficient. */
 # include <asm-generic/mutex-xchg.h>

 #else
diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h
index 549c978..dc8a127 100644
--- a/arch/arm/include/asm/system.h
+++ b/arch/arm/include/asm/system.h
@@ -254,14 +254,14 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size
 #ifdef swp_is_buggy
  unsigned long flags;
 #endif
-#if __LINUX_ARM_ARCH__ >= 6
+#if __LINUX_ARM_ARCH__ >= 6 && !defined(CONFIG_CPU_DCACHE_DISABLE)
  unsigned int tmp;
 #endif
 
  smp_mb();
 
  switch (size) {
-#if __LINUX_ARM_ARCH__ >= 6
+#if __LINUX_ARM_ARCH__ >= 6 && !defined(CONFIG_CPU_DCACHE_DISABLE)
  case 1:
   asm volatile("@ __xchg1\n"
   "1: ldrexb %0, [%3]\n"
@@ -329,7 +329,7 @@ extern void enable_hlt(void);
 
 #include <asm-generic/cmpxchg-local.h>

 
-#if __LINUX_ARM_ARCH__ < 6
+#if __LINUX_ARM_ARCH__ < 6 || defined(CONFIG_CPU_DCACHE_DISABLE)
 
 #ifdef CONFIG_SMP
 #error "SMP is not supported on this platform"
@@ -348,7 +348,7 @@ extern void enable_hlt(void);
 #include <asm-generic/cmpxchg.h>
 #endif
 
-#else /* __LINUX_ARM_ARCH__ >= 6 */
+#else /* __LINUX_ARM_ARCH__ >= 6  && !CONFIG_CPU_DCACHE_DISABLE */
 
 extern void __bad_cmpxchg(volatile void *ptr, int size);
 
diff --git a/arch/arm/lib/bitops.h b/arch/arm/lib/bitops.h
index d422529..271a452 100644
--- a/arch/arm/lib/bitops.h
+++ b/arch/arm/lib/bitops.h
@@ -1,5 +1,5 @@
 
-#if __LINUX_ARM_ARCH__ >= 6 && defined(CONFIG_CPU_32v6K)
+#if __LINUX_ARM_ARCH__ >= 6 && defined(CONFIG_CPU_32v6K) && !CONFIG_CPU_DCACHE_DISABLE
  .macro bitop, instr
  mov r2, #1
  and r3, r0, #7  @ Get bit offset
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index cc6f9d6..6aad450 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -671,7 +671,7 @@ config CPU_ICACHE_DISABLE
 
 config CPU_DCACHE_DISABLE
  bool "Disable D-Cache (C-bit)"
- depends on CPU_CP15
+ depends on CPU_CP15 && !SMP
  help
    Say Y here to disable the processor data cache. Unless
    you have a reason not to or are unsure, say N.
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index 37c8157..9b3f6a7 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -28,6 +28,7 @@
  */
 ENTRY(v7_flush_dcache_all)
  dmb     @ ensure ordering with previous memory accesses
+#ifndef CONFIG_CPU_DCACHE_DISABLE
  mrc p15, 1, r0, c0, c0, 1  @ read clidr
  ands r3, r0, #0x7000000  @ extract loc from clidr
  mov r3, r3, lsr #23   @ left align loc bit field
@@ -72,6 +73,7 @@ finished:
  mcr p15, 2, r10, c0, c0, 0  @ select current cache level in cssr
  dsb
  isb
+#endif /* CONFIG_CPU_DCACHE_DISABLE */
  mov pc, lr
 ENDPROC(v7_flush_dcache_all)
 
@@ -91,11 +93,13 @@ ENTRY(v7_flush_kern_cache_all)
  THUMB( stmfd sp!, {r4-r7, r9-r11, lr} )
  bl v7_flush_dcache_all
  mov r0, #0
+#ifndef CONFIG_CPU_ICACHE_DISABLE
 #ifdef CONFIG_SMP
  mcr p15, 0, r0, c7, c1, 0  @ invalidate I-cache inner shareable
 #else
  mcr p15, 0, r0, c7, c5, 0  @ I+BTB cache invalidate
 #endif
+#endif /* CONFIG_CPU_ICACHE_DISABLE */
  ARM( ldmfd sp!, {r4-r5, r7, r9-r11, lr} )
  THUMB( ldmfd sp!, {r4-r7, r9-r11, lr} )
  mov pc, lr
@@ -163,21 +167,31 @@ ENTRY(v7_coherent_user_range)
  sub r3, r2, #1
  bic r0, r0, r3
 1:
+#ifndef CONFIG_CPU_DCACHE_DISABLE
  USER( mcr p15, 0, r0, c7, c11, 1 ) @ clean D line to the point of unification
  dsb
+#endif /* CONFIG_CPU_DCACHE_DISABLE */
+#ifndef CONFIG_CPU_ICACHE_DISABLE
  USER( mcr p15, 0, r0, c7, c5, 1 ) @ invalidate I line
+#endif /* CONFIG_CPU_ICACHE_DISABLE */
  add r0, r0, r2
 2:
  cmp r0, r1
  blo 1b
  mov r0, #0
+#ifndef CONFIG_CPU_BPREDICT_DISABLE
 #ifdef CONFIG_SMP
  mcr p15, 0, r0, c7, c1, 6  @ invalidate BTB Inner Shareable
 #else
  mcr p15, 0, r0, c7, c5, 6  @ invalidate BTB
 #endif
+#endif /* CONFIG_CPU_BPREDICT_DISABLE */
+#if !defined(CONFIG_CPU_DCACHE_DISABLE) || \
+    !defined(CONFIG_CPU_ICACHE_DISABLE) || \
+    !defined(CONFIG_CPU_BPREDICT_DISABLE)
  dsb
  isb
+#endif
  mov pc, lr
 
 /*
@@ -203,6 +217,7 @@ ENDPROC(v7_coherent_user_range)
  * - size - region size
  */
 ENTRY(v7_flush_kern_dcache_area)
+#ifndef CONFIG_CPU_DCACHE_DISABLE
  dcache_line_size r2, r3
  add r1, r0, r1
 1:
@@ -211,6 +226,7 @@ ENTRY(v7_flush_kern_dcache_area)
  cmp r0, r1
  blo 1b
  dsb
+#endif  /* CONFIG_CPU_DCACHE_DISABLE */
  mov pc, lr
 ENDPROC(v7_flush_kern_dcache_area)
 
@@ -225,6 +241,7 @@ ENDPROC(v7_flush_kern_dcache_area)
  * - end     - virtual end address of region
  */
 v7_dma_inv_range:
+#ifndef CONFIG_CPU_DCACHE_DISABLE
  dcache_line_size r2, r3
  sub r3, r2, #1
  tst r0, r3
@@ -240,6 +257,7 @@ v7_dma_inv_range:
  cmp r0, r1
  blo 1b
  dsb
+#endif /* CONFIG_CPU_DCACHE_DISABLE */
  mov pc, lr
 ENDPROC(v7_dma_inv_range)
 
@@ -249,6 +267,7 @@ ENDPROC(v7_dma_inv_range)
  * - end     - virtual end address of region
  */
 v7_dma_clean_range:
+#ifndef CONFIG_CPU_DCACHE_DISABLE
  dcache_line_size r2, r3
  sub r3, r2, #1
  bic r0, r0, r3
@@ -258,6 +277,7 @@ v7_dma_clean_range:
  cmp r0, r1
  blo 1b
  dsb
+#endif /* CONFIG_CPU_DCACHE_DISABLE */
  mov pc, lr
 ENDPROC(v7_dma_clean_range)
 
@@ -267,6 +287,7 @@ ENDPROC(v7_dma_clean_range)
  * - end     - virtual end address of region
  */
 ENTRY(v7_dma_flush_range)
+#ifndef CONFIG_CPU_DCACHE_DISABLE
  dcache_line_size r2, r3
  sub r3, r2, #1
  bic r0, r0, r3
@@ -276,6 +297,7 @@ ENTRY(v7_dma_flush_range)
  cmp r0, r1
  blo 1b
  dsb
+#endif /* CONFIG_CPU_DCACHE_DISABLE */
  mov pc, lr
 ENDPROC(v7_dma_flush_range)
 
diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S
index 8cdf3bf..b24ffef 100644
--- a/arch/arm/mm/proc-v7.S
+++ b/arch/arm/mm/proc-v7.S
@@ -80,6 +80,7 @@ ENTRY(cpu_v7_do_idle)
 ENDPROC(cpu_v7_do_idle)
 
 ENTRY(cpu_v7_dcache_clean_area)
+#ifndef CONFIG_CPU_DCACHE_DISABLE
 #ifndef TLB_CAN_READ_FROM_L1_CACHE
  dcache_line_size r2, r3
 1: mcr p15, 0, r0, c7, c10, 1  @ clean D entry
@@ -88,6 +89,7 @@ ENTRY(cpu_v7_dcache_clean_area)
  bhi 1b
  dsb
 #endif
+#endif /* CONFIG_CPU_DCACHE_DISABLE */
  mov pc, lr
 ENDPROC(cpu_v7_dcache_clean_area)
 
@@ -106,9 +108,11 @@ ENTRY(cpu_v7_switch_mm)
  mov r2, #0
  ldr r1, [r1, #MM_CONTEXT_ID] @ get mm->context.id

  orr r0, r0, #TTB_FLAGS
+#ifndef CONFIG_CPU_BPREDICT_DISABLE
 #ifdef CONFIG_ARM_ERRATA_430973
  mcr p15, 0, r2, c7, c5, 6  @ flush BTAC/BTB
 #endif
+#endif /* CONFIG_CPU_BPREDICT_DISABLE */
  mcr p15, 0, r2, c13, c0, 1  @ set reserved context ID
  isb
 1: mcr p15, 0, r0, c2, c0, 0  @ set TTB 0
@@ -160,7 +164,9 @@ ENTRY(cpu_v7_set_pte_ext)
  moveq r3, #0
 
  str r3, [r0]
+#ifndef CONFIG_CPU_DCACHE_DISABLE
  mcr p15, 0, r0, c7, c10, 1  @ flush_pte
+#endif /* CONFIG_CPU_DCACHE_DISABLE */
 #endif
  mov pc, lr
 ENDPROC(cpu_v7_set_pte_ext)
@@ -264,7 +270,9 @@ __v7_setup:
 
 3: mov r10, #0
 #ifdef HARVARD_CACHE
+#ifndef CONFIG_CPU_ICACHE_DISABLE
  mcr p15, 0, r10, c7, c5, 0  @ I+BTB cache invalidate
+#endif /* CONFIG_CPU_ICACHE_DISABLE */
 #endif
  dsb
 #ifdef CONFIG_MMU
-- 
1.7.0.4
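
As for that pre-ARMv6-style spinlock idea - here is a rough, untested sketch of what a SWP-based lock could look like. The names are made up and this is not part of the patch above; also note that SWP is deprecated on ARMv6+ and on some ARMv7 cores it even has to be enabled explicitly (the SW bit in SCTLR), so treat this as a starting point, not a drop-in raw_spin_lock replacement:

/* Hypothetical sketch only - not from the patch above. */
static inline void swp_spin_lock(volatile unsigned long *lock)
{
	unsigned long old;

	do {
		__asm__ __volatile__(
		"	swp	%0, %1, [%2]"	/* atomically: old = *lock, *lock = 1 */
		: "=&r" (old)
		: "r" (1UL), "r" (lock)
		: "memory");
	} while (old != 0);			/* someone else already holds it */
}

static inline void swp_spin_unlock(volatile unsigned long *lock)
{
	__asm__ __volatile__("" ::: "memory");	/* compiler barrier */
	*lock = 0;				/* plain store releases the lock */
}

Memory barriers are deliberately left out for brevity; a real SMP implementation would need them around acquire and release.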
