]> git.baikalelectronics.ru Git - kernel.git/commitdiff
powerpc/64s: Improve RFI L1-D cache flush fallback
author Nicholas Piggin <npiggin@gmail.com>
Wed, 17 Jan 2018 13:58:18 +0000 (23:58 +1000)
committer Michael Ellerman <mpe@ellerman.id.au>
Tue, 23 Jan 2018 05:16:33 +0000 (16:16 +1100)
The fallback RFI flush is used when firmware does not provide a way
to flush the cache. It's a "displacement flush" that evicts useful
data by displacing it with an uninteresting buffer.

The flush has to take care to work with implementation specific cache
replacement policies, so the recipe has been in flux. The initial
slow but conservative approach was to touch all lines of a congruence
class, with dependencies between each load. It has since been
determined that a linear pattern of loads without dependencies is
sufficient, and is significantly faster.

Measuring the speed of a null syscall with RFI fallback flush enabled
gives the relative improvement:

P8 - 1.83x
P9 - 1.75x

The flush also becomes simpler and more adaptable to different cache
geometries.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
arch/powerpc/include/asm/paca.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/setup_64.c
arch/powerpc/xmon/xmon.c

index 1b7fd535393b56758aa97c062edd5cc95d6b1089..b62c31037cadefe742326c441b49ba5a0204f480 100644 (file)
@@ -239,8 +239,7 @@ struct paca_struct {
         */
        u64 exrfi[EX_SIZE] __aligned(0x80);
        void *rfi_flush_fallback_area;
-       u64 l1d_flush_congruence;
-       u64 l1d_flush_sets;
+       u64 l1d_flush_size;
 #endif
 };
 
index fa5c4125f42a582dc0ee218f815dc3b3f20fad73..88b84ac76b5325791b9df17e3d9a8bf9357aba1c 100644 (file)
@@ -239,8 +239,7 @@ int main(void)
        OFFSET(PACA_IN_NMI, paca_struct, in_nmi);
        OFFSET(PACA_RFI_FLUSH_FALLBACK_AREA, paca_struct, rfi_flush_fallback_area);
        OFFSET(PACA_EXRFI, paca_struct, exrfi);
-       OFFSET(PACA_L1D_FLUSH_CONGRUENCE, paca_struct, l1d_flush_congruence);
-       OFFSET(PACA_L1D_FLUSH_SETS, paca_struct, l1d_flush_sets);
+       OFFSET(PACA_L1D_FLUSH_SIZE, paca_struct, l1d_flush_size);
 
 #endif
        OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);
index 9e6882bdc526e8809061bab8e0a3aefb6f0d6a18..243d072a225aac1f7c7eaa69b6e5ef8cd21ce2c6 100644 (file)
@@ -1461,39 +1461,37 @@ TRAMP_REAL_BEGIN(rfi_flush_fallback)
        std     r9,PACA_EXRFI+EX_R9(r13)
        std     r10,PACA_EXRFI+EX_R10(r13)
        std     r11,PACA_EXRFI+EX_R11(r13)
-       std     r12,PACA_EXRFI+EX_R12(r13)
-       std     r8,PACA_EXRFI+EX_R13(r13)
        mfctr   r9
        ld      r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
-       ld      r11,PACA_L1D_FLUSH_SETS(r13)
-       ld      r12,PACA_L1D_FLUSH_CONGRUENCE(r13)
-       /*
-        * The load adresses are at staggered offsets within cachelines,
-        * which suits some pipelines better (on others it should not
-        * hurt).
-        */
-       addi    r12,r12,8
+       ld      r11,PACA_L1D_FLUSH_SIZE(r13)
+       srdi    r11,r11,(7 + 3) /* 128 byte lines, unrolled 8x */
        mtctr   r11
        DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */
 
        /* order ld/st prior to dcbt stop all streams with flushing */
        sync
-1:     li      r8,0
-       .rept   8 /* 8-way set associative */
-       ldx     r11,r10,r8
-       add     r8,r8,r12
-       xor     r11,r11,r11     // Ensure r11 is 0 even if fallback area is not
-       add     r8,r8,r11       // Add 0, this creates a dependency on the ldx
-       .endr
-       addi    r10,r10,128 /* 128 byte cache line */
+
+       /*
+        * The load adresses are at staggered offsets within cachelines,
+        * which suits some pipelines better (on others it should not
+        * hurt).
+        */
+1:
+       ld      r11,(0x80 + 8)*0(r10)
+       ld      r11,(0x80 + 8)*1(r10)
+       ld      r11,(0x80 + 8)*2(r10)
+       ld      r11,(0x80 + 8)*3(r10)
+       ld      r11,(0x80 + 8)*4(r10)
+       ld      r11,(0x80 + 8)*5(r10)
+       ld      r11,(0x80 + 8)*6(r10)
+       ld      r11,(0x80 + 8)*7(r10)
+       addi    r10,r10,0x80*8
        bdnz    1b
 
        mtctr   r9
        ld      r9,PACA_EXRFI+EX_R9(r13)
        ld      r10,PACA_EXRFI+EX_R10(r13)
        ld      r11,PACA_EXRFI+EX_R11(r13)
-       ld      r12,PACA_EXRFI+EX_R12(r13)
-       ld      r8,PACA_EXRFI+EX_R13(r13)
        GET_SCRATCH0(r13);
        rfid
 
@@ -1503,39 +1501,37 @@ TRAMP_REAL_BEGIN(hrfi_flush_fallback)
        std     r9,PACA_EXRFI+EX_R9(r13)
        std     r10,PACA_EXRFI+EX_R10(r13)
        std     r11,PACA_EXRFI+EX_R11(r13)
-       std     r12,PACA_EXRFI+EX_R12(r13)
-       std     r8,PACA_EXRFI+EX_R13(r13)
        mfctr   r9
        ld      r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
-       ld      r11,PACA_L1D_FLUSH_SETS(r13)
-       ld      r12,PACA_L1D_FLUSH_CONGRUENCE(r13)
-       /*
-        * The load adresses are at staggered offsets within cachelines,
-        * which suits some pipelines better (on others it should not
-        * hurt).
-        */
-       addi    r12,r12,8
+       ld      r11,PACA_L1D_FLUSH_SIZE(r13)
+       srdi    r11,r11,(7 + 3) /* 128 byte lines, unrolled 8x */
        mtctr   r11
        DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */
 
        /* order ld/st prior to dcbt stop all streams with flushing */
        sync
-1:     li      r8,0
-       .rept   8 /* 8-way set associative */
-       ldx     r11,r10,r8
-       add     r8,r8,r12
-       xor     r11,r11,r11     // Ensure r11 is 0 even if fallback area is not
-       add     r8,r8,r11       // Add 0, this creates a dependency on the ldx
-       .endr
-       addi    r10,r10,128 /* 128 byte cache line */
+
+       /*
+        * The load adresses are at staggered offsets within cachelines,
+        * which suits some pipelines better (on others it should not
+        * hurt).
+        */
+1:
+       ld      r11,(0x80 + 8)*0(r10)
+       ld      r11,(0x80 + 8)*1(r10)
+       ld      r11,(0x80 + 8)*2(r10)
+       ld      r11,(0x80 + 8)*3(r10)
+       ld      r11,(0x80 + 8)*4(r10)
+       ld      r11,(0x80 + 8)*5(r10)
+       ld      r11,(0x80 + 8)*6(r10)
+       ld      r11,(0x80 + 8)*7(r10)
+       addi    r10,r10,0x80*8
        bdnz    1b
 
        mtctr   r9
        ld      r9,PACA_EXRFI+EX_R9(r13)
        ld      r10,PACA_EXRFI+EX_R10(r13)
        ld      r11,PACA_EXRFI+EX_R11(r13)
-       ld      r12,PACA_EXRFI+EX_R12(r13)
-       ld      r8,PACA_EXRFI+EX_R13(r13)
        GET_SCRATCH0(r13);
        hrfid
 
index d1fa0e91f526ec3e1a82094160f4ea94d2e0c361..c388cc3357fa0e9f236277ac6f18a782a62c6bcd 100644 (file)
@@ -875,19 +875,8 @@ static void init_fallback_flush(void)
        memset(l1d_flush_fallback_area, 0, l1d_size * 2);
 
        for_each_possible_cpu(cpu) {
-               /*
-                * The fallback flush is currently coded for 8-way
-                * associativity. Different associativity is possible, but it
-                * will be treated as 8-way and may not evict the lines as
-                * effectively.
-                *
-                * 128 byte lines are mandatory.
-                */
-               u64 c = l1d_size / 8;
-
                paca[cpu].rfi_flush_fallback_area = l1d_flush_fallback_area;
-               paca[cpu].l1d_flush_congruence = c;
-               paca[cpu].l1d_flush_sets = c / 128;
+               paca[cpu].l1d_flush_size = l1d_size;
        }
 }
 
index 01d9a2dcff2004b3912eb90173507961e61cc0a2..82e1a3ee6e0fc0e8bf53ea22e8dd986ab2de508b 100644 (file)
@@ -2377,8 +2377,6 @@ static void dump_one_paca(int cpu)
                printf(" slb_cache[%d]:        = 0x%016lx\n", i, p->slb_cache[i]);
 
        DUMP(p, rfi_flush_fallback_area, "px");
-       DUMP(p, l1d_flush_congruence, "llx");
-       DUMP(p, l1d_flush_sets, "llx");
 #endif
        DUMP(p, dscr_default, "llx");
 #ifdef CONFIG_PPC_BOOK3E