crypto: arm64/aes-neon-blk - tweak performance for low end cores

author Ard Biesheuvel <ard.biesheuvel@linaro.org>

Sat, 28 Jan 2017 23:25:38 +0000 (23:25 +0000)

committer Herbert Xu <herbert@gondor.apana.org.au>

Fri, 3 Feb 2017 10:16:20 +0000 (18:16 +0800)
author Ard Biesheuvel <ard.biesheuvel@linaro.org>
Sat, 28 Jan 2017 23:25:38 +0000 (23:25 +0000)
committer Herbert Xu <herbert@gondor.apana.org.au>
Fri, 3 Feb 2017 10:16:20 +0000 (18:16 +0800)
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c

index 8ee1fb7aaa4fdc9e2535f06d982a90f49fec3881..055bc3f61138843f337754e0def2e68abc7e47bc 100644 (file)
--- a/arch/arm64/crypto/aes-glue.c
+++ b/arch/arm64/crypto/aes-glue.c
@@ -409,5 +409,7 @@ unregister_simds:
  module_cpu_feature_match(AES, aes_init);
  #else
  module_init(aes_init);
+EXPORT_SYMBOL(neon_aes_ecb_encrypt);
+EXPORT_SYMBOL(neon_aes_cbc_encrypt);
  #endif
  module_exit(aes_exit);
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S

index 85f07ead7c5c2792c44df0c2970774c60159615d..f1e3aa2732f93bb98b4be144f6caec0e9a6cf5b6 100644 (file)
--- a/arch/arm64/crypto/aes-neon.S
+++ b/arch/arm64/crypto/aes-neon.S
@@ -1,7 +1,7 @@
  /*
   * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
   *
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License version 2 as
@@ -17,17 +17,25 @@
         /* multiply by polynomial 'x' in GF(2^8) */
         .macro          mul_by_x, out, in, temp, const
         sshr            \temp, \in, #7
-       add             \out, \in, \in
+       shl             \out, \in, #1
         and             \temp, \temp, \const
         eor             \out, \out, \temp
         .endm
  
+       /* multiply by polynomial 'x^2' in GF(2^8) */
+       .macro          mul_by_x2, out, in, temp, const
+       ushr            \temp, \in, #6
+       shl             \out, \in, #2
+       pmul            \temp, \temp, \const
+       eor             \out, \out, \temp
+       .endm
+
         /* preload the entire Sbox */
         .macro          prepare, sbox, shiftrows, temp
         adr             \temp, \sbox
-       movi            v12.16b, #0x40
+       movi            v12.16b, #0x1b
         ldr             q13, \shiftrows
-       movi            v14.16b, #0x1b
+       ldr             q14, .Lror32by8
         ld1             {v16.16b-v19.16b}, [\temp], #64
         ld1             {v20.16b-v23.16b}, [\temp], #64
         ld1             {v24.16b-v27.16b}, [\temp], #64
@@ -50,37 +58,31 @@
  
         /* apply SubBytes transformation using the preloaded Sbox */
         .macro          sub_bytes, in
-       sub             v9.16b, \in\().16b, v12.16b
+       sub             v9.16b, \in\().16b, v15.16b
         tbl             \in\().16b, {v16.16b-v19.16b}, \in\().16b
-       sub             v10.16b, v9.16b, v12.16b
+       sub             v10.16b, v9.16b, v15.16b
         tbx             \in\().16b, {v20.16b-v23.16b}, v9.16b
-       sub             v11.16b, v10.16b, v12.16b
+       sub             v11.16b, v10.16b, v15.16b
         tbx             \in\().16b, {v24.16b-v27.16b}, v10.16b
         tbx             \in\().16b, {v28.16b-v31.16b}, v11.16b
         .endm
  
         /* apply MixColumns transformation */
-       .macro          mix_columns, in
-       mul_by_x        v10.16b, \in\().16b, v9.16b, v14.16b
-       rev32           v8.8h, \in\().8h
-       eor             \in\().16b, v10.16b, \in\().16b
-       shl             v9.4s, v8.4s, #24
-       shl             v11.4s, \in\().4s, #24
-       sri             v9.4s, v8.4s, #8
-       sri             v11.4s, \in\().4s, #8
-       eor             v9.16b, v9.16b, v8.16b
-       eor             v10.16b, v10.16b, v9.16b
-       eor             \in\().16b, v10.16b, v11.16b
-       .endm
-
+       .macro          mix_columns, in, enc
+       .if             \enc == 0
         /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
-       .macro          inv_mix_columns, in
-       mul_by_x        v11.16b, \in\().16b, v10.16b, v14.16b
-       mul_by_x        v11.16b, v11.16b, v10.16b, v14.16b
-       eor             \in\().16b, \in\().16b, v11.16b
-       rev32           v11.8h, v11.8h
-       eor             \in\().16b, \in\().16b, v11.16b
-       mix_columns     \in
+       mul_by_x2       v8.16b, \in\().16b, v9.16b, v12.16b
+       eor             \in\().16b, \in\().16b, v8.16b
+       rev32           v8.8h, v8.8h
+       eor             \in\().16b, \in\().16b, v8.16b
+       .endif
+
+       mul_by_x        v9.16b, \in\().16b, v8.16b, v12.16b
+       rev32           v8.8h, \in\().8h
+       eor             v8.16b, v8.16b, v9.16b
+       eor             \in\().16b, \in\().16b, v8.16b
+       tbl             \in\().16b, {\in\().16b}, v14.16b
+       eor             \in\().16b, \in\().16b, v8.16b
         .endm
  
         .macro          do_block, enc, in, rounds, rk, rkp, i
@@ -88,16 +90,13 @@
         add             \rkp, \rk, #16
         mov             \i, \rounds
  1111:  eor             \in\().16b, \in\().16b, v15.16b         /* ^round key */
+       movi            v15.16b, #0x40
         tbl             \in\().16b, {\in\().16b}, v13.16b       /* ShiftRows */
         sub_bytes       \in
-       ld1             {v15.4s}, [\rkp], #16
         subs            \i, \i, #1
+       ld1             {v15.4s}, [\rkp], #16
         beq             2222f
-       .if             \enc == 1
-       mix_columns     \in
-       .else
-       inv_mix_columns \in
-       .endif
+       mix_columns     \in, \enc
         b               1111b
  2222:  eor             \in\().16b, \in\().16b, v15.16b         /* ^round key */
         .endm
@@ -116,139 +115,114 @@
          */
  
         .macro          sub_bytes_2x, in0, in1
-       sub             v8.16b, \in0\().16b, v12.16b
-       sub             v9.16b, \in1\().16b, v12.16b
+       sub             v8.16b, \in0\().16b, v15.16b
         tbl             \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
+       sub             v9.16b, \in1\().16b, v15.16b
         tbl             \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
-       sub             v10.16b, v8.16b, v12.16b
-       sub             v11.16b, v9.16b, v12.16b
+       sub             v10.16b, v8.16b, v15.16b
         tbx             \in0\().16b, {v20.16b-v23.16b}, v8.16b
+       sub             v11.16b, v9.16b, v15.16b
         tbx             \in1\().16b, {v20.16b-v23.16b}, v9.16b
-       sub             v8.16b, v10.16b, v12.16b
-       sub             v9.16b, v11.16b, v12.16b
+       sub             v8.16b, v10.16b, v15.16b
         tbx             \in0\().16b, {v24.16b-v27.16b}, v10.16b
+       sub             v9.16b, v11.16b, v15.16b
         tbx             \in1\().16b, {v24.16b-v27.16b}, v11.16b
         tbx             \in0\().16b, {v28.16b-v31.16b}, v8.16b
         tbx             \in1\().16b, {v28.16b-v31.16b}, v9.16b
         .endm
  
         .macro          sub_bytes_4x, in0, in1, in2, in3
-       sub             v8.16b, \in0\().16b, v12.16b
+       sub             v8.16b, \in0\().16b, v15.16b
         tbl             \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
-       sub             v9.16b, \in1\().16b, v12.16b
+       sub             v9.16b, \in1\().16b, v15.16b
         tbl             \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
-       sub             v10.16b, \in2\().16b, v12.16b
+       sub             v10.16b, \in2\().16b, v15.16b
         tbl             \in2\().16b, {v16.16b-v19.16b}, \in2\().16b
-       sub             v11.16b, \in3\().16b, v12.16b
+       sub             v11.16b, \in3\().16b, v15.16b
         tbl             \in3\().16b, {v16.16b-v19.16b}, \in3\().16b
         tbx             \in0\().16b, {v20.16b-v23.16b}, v8.16b
         tbx             \in1\().16b, {v20.16b-v23.16b}, v9.16b
-       sub             v8.16b, v8.16b, v12.16b
+       sub             v8.16b, v8.16b, v15.16b
         tbx             \in2\().16b, {v20.16b-v23.16b}, v10.16b
-       sub             v9.16b, v9.16b, v12.16b
+       sub             v9.16b, v9.16b, v15.16b
         tbx             \in3\().16b, {v20.16b-v23.16b}, v11.16b
-       sub             v10.16b, v10.16b, v12.16b
+       sub             v10.16b, v10.16b, v15.16b
         tbx             \in0\().16b, {v24.16b-v27.16b}, v8.16b
-       sub             v11.16b, v11.16b, v12.16b
+       sub             v11.16b, v11.16b, v15.16b
         tbx             \in1\().16b, {v24.16b-v27.16b}, v9.16b
-       sub             v8.16b, v8.16b, v12.16b
+       sub             v8.16b, v8.16b, v15.16b
         tbx             \in2\().16b, {v24.16b-v27.16b}, v10.16b
-       sub             v9.16b, v9.16b, v12.16b
+       sub             v9.16b, v9.16b, v15.16b
         tbx             \in3\().16b, {v24.16b-v27.16b}, v11.16b
-       sub             v10.16b, v10.16b, v12.16b
+       sub             v10.16b, v10.16b, v15.16b
         tbx             \in0\().16b, {v28.16b-v31.16b}, v8.16b
-       sub             v11.16b, v11.16b, v12.16b
+       sub             v11.16b, v11.16b, v15.16b
         tbx             \in1\().16b, {v28.16b-v31.16b}, v9.16b
         tbx             \in2\().16b, {v28.16b-v31.16b}, v10.16b
         tbx             \in3\().16b, {v28.16b-v31.16b}, v11.16b
         .endm
  
         .macro          mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
-       sshr            \tmp0\().16b, \in0\().16b,  #7
-       add             \out0\().16b, \in0\().16b,  \in0\().16b
-       sshr            \tmp1\().16b, \in1\().16b,  #7
+       sshr            \tmp0\().16b, \in0\().16b, #7
+       shl             \out0\().16b, \in0\().16b, #1
+       sshr            \tmp1\().16b, \in1\().16b, #7
         and             \tmp0\().16b, \tmp0\().16b, \const\().16b
-       add             \out1\().16b, \in1\().16b,  \in1\().16b
+       shl             \out1\().16b, \in1\().16b, #1
         and             \tmp1\().16b, \tmp1\().16b, \const\().16b
         eor             \out0\().16b, \out0\().16b, \tmp0\().16b
         eor             \out1\().16b, \out1\().16b, \tmp1\().16b
         .endm
  
-       .macro          mix_columns_2x, in0, in1
-       mul_by_x_2x     v8, v9, \in0, \in1, v10, v11, v14
-       rev32           v10.8h, \in0\().8h
-       rev32           v11.8h, \in1\().8h
-       eor             \in0\().16b, v8.16b, \in0\().16b
-       eor             \in1\().16b, v9.16b, \in1\().16b
-       shl             v12.4s, v10.4s, #24
-       shl             v13.4s, v11.4s, #24
-       eor             v8.16b, v8.16b, v10.16b
-       sri             v12.4s, v10.4s, #8
-       shl             v10.4s, \in0\().4s, #24
-       eor             v9.16b, v9.16b, v11.16b
-       sri             v13.4s, v11.4s, #8
-       shl             v11.4s, \in1\().4s, #24
-       sri             v10.4s, \in0\().4s, #8
-       eor             \in0\().16b, v8.16b, v12.16b
-       sri             v11.4s, \in1\().4s, #8
-       eor             \in1\().16b, v9.16b, v13.16b
-       eor             \in0\().16b, v10.16b, \in0\().16b
-       eor             \in1\().16b, v11.16b, \in1\().16b
+       .macro          mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
+       ushr            \tmp0\().16b, \in0\().16b, #6
+       shl             \out0\().16b, \in0\().16b, #2
+       ushr            \tmp1\().16b, \in1\().16b, #6
+       pmul            \tmp0\().16b, \tmp0\().16b, \const\().16b
+       shl             \out1\().16b, \in1\().16b, #2
+       pmul            \tmp1\().16b, \tmp1\().16b, \const\().16b
+       eor             \out0\().16b, \out0\().16b, \tmp0\().16b
+       eor             \out1\().16b, \out1\().16b, \tmp1\().16b
         .endm
  
-       .macro          inv_mix_cols_2x, in0, in1
-       mul_by_x_2x     v8, v9, \in0, \in1, v10, v11, v14
-       mul_by_x_2x     v8, v9, v8, v9, v10, v11, v14
+       .macro          mix_columns_2x, in0, in1, enc
+       .if             \enc == 0
+       /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
+       mul_by_x2_2x    v8, v9, \in0, \in1, v10, v11, v12
         eor             \in0\().16b, \in0\().16b, v8.16b
-       eor             \in1\().16b, \in1\().16b, v9.16b
         rev32           v8.8h, v8.8h
-       rev32           v9.8h, v9.8h
-       eor             \in0\().16b, \in0\().16b, v8.16b
-       eor             \in1\().16b, \in1\().16b, v9.16b
-       mix_columns_2x  \in0, \in1
-       .endm
-
-       .macro          inv_mix_cols_4x, in0, in1, in2, in3
-       mul_by_x_2x     v8, v9, \in0, \in1, v10, v11, v14
-       mul_by_x_2x     v10, v11, \in2, \in3, v12, v13, v14
-       mul_by_x_2x     v8, v9, v8, v9, v12, v13, v14
-       mul_by_x_2x     v10, v11, v10, v11, v12, v13, v14
-       eor             \in0\().16b, \in0\().16b, v8.16b
         eor             \in1\().16b, \in1\().16b, v9.16b
-       eor             \in2\().16b, \in2\().16b, v10.16b
-       eor             \in3\().16b, \in3\().16b, v11.16b
-       rev32           v8.8h, v8.8h
         rev32           v9.8h, v9.8h
-       rev32           v10.8h, v10.8h
-       rev32           v11.8h, v11.8h
         eor             \in0\().16b, \in0\().16b, v8.16b
         eor             \in1\().16b, \in1\().16b, v9.16b
-       eor             \in2\().16b, \in2\().16b, v10.16b
-       eor             \in3\().16b, \in3\().16b, v11.16b
-       mix_columns_2x  \in0, \in1
-       mix_columns_2x  \in2, \in3
+       .endif
+
+       mul_by_x_2x     v8, v9, \in0, \in1, v10, v11, v12
+       rev32           v10.8h, \in0\().8h
+       rev32           v11.8h, \in1\().8h
+       eor             v10.16b, v10.16b, v8.16b
+       eor             v11.16b, v11.16b, v9.16b
+       eor             \in0\().16b, \in0\().16b, v10.16b
+       eor             \in1\().16b, \in1\().16b, v11.16b
+       tbl             \in0\().16b, {\in0\().16b}, v14.16b
+       tbl             \in1\().16b, {\in1\().16b}, v14.16b
+       eor             \in0\().16b, \in0\().16b, v10.16b
+       eor             \in1\().16b, \in1\().16b, v11.16b
         .endm
  
-       .macro          do_block_2x, enc, in0, in1 rounds, rk, rkp, i
+       .macro          do_block_2x, enc, in0, in1, rounds, rk, rkp, i
         ld1             {v15.4s}, [\rk]
         add             \rkp, \rk, #16
         mov             \i, \rounds
  1111:  eor             \in0\().16b, \in0\().16b, v15.16b       /* ^round key */
         eor             \in1\().16b, \in1\().16b, v15.16b       /* ^round key */
-       sub_bytes_2x    \in0, \in1
+       movi            v15.16b, #0x40
         tbl             \in0\().16b, {\in0\().16b}, v13.16b     /* ShiftRows */
         tbl             \in1\().16b, {\in1\().16b}, v13.16b     /* ShiftRows */
-       ld1             {v15.4s}, [\rkp], #16
+       sub_bytes_2x    \in0, \in1
         subs            \i, \i, #1
+       ld1             {v15.4s}, [\rkp], #16
         beq             2222f
-       .if             \enc == 1
-       mix_columns_2x  \in0, \in1
-       ldr             q13, .LForward_ShiftRows
-       .else
-       inv_mix_cols_2x \in0, \in1
-       ldr             q13, .LReverse_ShiftRows
-       .endif
-       movi            v12.16b, #0x40
+       mix_columns_2x  \in0, \in1, \enc
         b               1111b
  2222:  eor             \in0\().16b, \in0\().16b, v15.16b       /* ^round key */
         eor             \in1\().16b, \in1\().16b, v15.16b       /* ^round key */
@@ -262,23 +236,17 @@
         eor             \in1\().16b, \in1\().16b, v15.16b       /* ^round key */
         eor             \in2\().16b, \in2\().16b, v15.16b       /* ^round key */
         eor             \in3\().16b, \in3\().16b, v15.16b       /* ^round key */
-       sub_bytes_4x    \in0, \in1, \in2, \in3
+       movi            v15.16b, #0x40
         tbl             \in0\().16b, {\in0\().16b}, v13.16b     /* ShiftRows */
         tbl             \in1\().16b, {\in1\().16b}, v13.16b     /* ShiftRows */
         tbl             \in2\().16b, {\in2\().16b}, v13.16b     /* ShiftRows */
         tbl             \in3\().16b, {\in3\().16b}, v13.16b     /* ShiftRows */
-       ld1             {v15.4s}, [\rkp], #16
+       sub_bytes_4x    \in0, \in1, \in2, \in3
         subs            \i, \i, #1
+       ld1             {v15.4s}, [\rkp], #16
         beq             2222f
-       .if             \enc == 1
-       mix_columns_2x  \in0, \in1
-       mix_columns_2x  \in2, \in3
-       ldr             q13, .LForward_ShiftRows
-       .else
-       inv_mix_cols_4x \in0, \in1, \in2, \in3
-       ldr             q13, .LReverse_ShiftRows
-       .endif
-       movi            v12.16b, #0x40
+       mix_columns_2x  \in0, \in1, \enc
+       mix_columns_2x  \in2, \in3, \enc
         b               1111b
  2222:  eor             \in0\().16b, \in0\().16b, v15.16b       /* ^round key */
         eor             \in1\().16b, \in1\().16b, v15.16b       /* ^round key */
@@ -305,19 +273,7 @@
  #include "aes-modes.S"
  
         .text
-       .align          4
-.LForward_ShiftRows:
-CPU_LE(        .byte           0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3  )
-CPU_LE(        .byte           0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb  )
-CPU_BE(        .byte           0xb, 0x6, 0x1, 0xc, 0x7, 0x2, 0xd, 0x8  )
-CPU_BE(        .byte           0x3, 0xe, 0x9, 0x4, 0xf, 0xa, 0x5, 0x0  )
-
-.LReverse_ShiftRows:
-CPU_LE(        .byte           0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb  )
-CPU_LE(        .byte           0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3  )
-CPU_BE(        .byte           0x3, 0x6, 0x9, 0xc, 0xf, 0x2, 0x5, 0x8  )
-CPU_BE(        .byte           0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0  )
-
+       .align          6
  .LForward_Sbox:
         .byte           0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
         .byte           0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
@@ -385,3 +341,12 @@ CPU_BE(    .byte           0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0  )
         .byte           0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
         .byte           0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
         .byte           0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+
+.LForward_ShiftRows:
+       .octa           0x0b06010c07020d08030e09040f0a0500
+
+.LReverse_ShiftRows:
+       .octa           0x0306090c0f0205080b0e0104070a0d00
+
+.Lror32by8:
+       .octa           0x0c0f0e0d080b0a090407060500030201
author	Ard Biesheuvel <ard.biesheuvel@linaro.org>
	Sat, 28 Jan 2017 23:25:38 +0000 (23:25 +0000)
committer	Herbert Xu <herbert@gondor.apana.org.au>
	Fri, 3 Feb 2017 10:16:20 +0000 (18:16 +0800)
arch/arm64/crypto/aes-glue.c		patch \| blob \| history
arch/arm64/crypto/aes-neon.S		patch \| blob \| history