  1. /*
  2. * Implement AES algorithm in Intel AES-NI instructions.
  3. *
  4. * The white paper of AES-NI instructions can be downloaded from:
  5. * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
  6. *
  7. * Copyright (C) 2008, Intel Corp.
  8. * Author: Huang Ying <ying.huang@intel.com>
  9. * Vinodh Gopal <vinodh.gopal@intel.com>
  10. * Kahraman Akdemir
  11. *
  12. * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  13. * interface for 64-bit kernels.
  14. * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  15. * Aidan O'Mahony (aidan.o.mahony@intel.com)
  16. * Adrian Hoban <adrian.hoban@intel.com>
  17. * James Guilford (james.guilford@intel.com)
  18. * Gabriele Paoloni <gabriele.paoloni@intel.com>
  19. * Tadeusz Struk (tadeusz.struk@intel.com)
  20. * Wajdi Feghali (wajdi.k.feghali@intel.com)
  21. * Copyright (c) 2010, Intel Corporation.
  22. *
  23. * Ported x86_64 version to x86:
  24. * Author: Mathias Krause <minipli@googlemail.com>
  25. *
  26. * This program is free software; you can redistribute it and/or modify
  27. * it under the terms of the GNU General Public License as published by
  28. * the Free Software Foundation; either version 2 of the License, or
  29. * (at your option) any later version.
  30. */
  31. #include <linux/linkage.h>
  32. #include <asm/inst.h>
  33. /*
  34. * The following macros are used to move an (un)aligned 16 byte value to/from
   35. * an XMM register. This can be done for either FP or integer values; for FP use
   36. * movaps (move aligned packed single), for integer use movdqa (move double quad
  37. * aligned). It doesn't make a performance difference which instruction is used
  38. * since Nehalem (original Core i7) was released. However, the movaps is a byte
  39. * shorter, so that is the one we'll use for now. (same for unaligned).
  40. */
  41. #define MOVADQ movaps
  42. #define MOVUDQ movups
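/*
 * Both macros take the same operands as the SSE moves they expand to; for
 * illustration only, "MOVADQ SHUF_MASK(%rip), %xmm14" becomes an aligned
 * 16 byte load via movaps, and "MOVUDQ (%r10), %xmm1" an unaligned load
 * via movups (the operands here are just examples).
 */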
  43. #ifdef __x86_64__
  44. .data
  45. .align 16
  46. .Lgf128mul_x_ble_mask:
  47. .octa 0x00000000000000010000000000000087
  48. POLY: .octa 0xC2000000000000000000000000000001
  49. TWOONE: .octa 0x00000001000000000000000000000001
  50. # order of these constants should not change.
  51. # more specifically, ALL_F should follow SHIFT_MASK,
  52. # and ZERO should follow ALL_F
  53. SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
  54. MASK1: .octa 0x0000000000000000ffffffffffffffff
  55. MASK2: .octa 0xffffffffffffffff0000000000000000
  56. SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  57. ALL_F: .octa 0xffffffffffffffffffffffffffffffff
  58. ZERO: .octa 0x00000000000000000000000000000000
  59. ONE: .octa 0x00000000000000000000000000000001
  60. F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  61. dec: .octa 0x1
  62. enc: .octa 0x2
  63. .text
  64. #define STACK_OFFSET 8*3
  65. #define HashKey 16*0 // store HashKey <<1 mod poly here
  66. #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
  67. #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
  68. #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
  69. #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
  70. // bits of HashKey <<1 mod poly here
  71. //(for Karatsuba purposes)
  72. #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
  73. // bits of HashKey^2 <<1 mod poly here
  74. // (for Karatsuba purposes)
  75. #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
  76. // bits of HashKey^3 <<1 mod poly here
  77. // (for Karatsuba purposes)
  78. #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
  79. // bits of HashKey^4 <<1 mod poly here
  80. // (for Karatsuba purposes)
  81. #define VARIABLE_OFFSET 16*8
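/*
 * The HashKey_i_k slots above exist for Karatsuba multiplication: writing
 * a = a1*x^64 + a0 and b = b1*x^64 + b0, the carry-less product is
 *
 *	a*b = a1*b1*x^128 + ((a1+a0)*(b1+b0) + a1*b1 + a0*b0)*x^64 + a0*b0
 *
 * with all additions being XOR, so each 128-bit multiply needs three
 * PCLMULQDQs instead of four, and caching b1+b0 for every power of the
 * hash key avoids recomputing it in the main loop.
 */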
  82. #define arg1 rdi
  83. #define arg2 rsi
  84. #define arg3 rdx
  85. #define arg4 rcx
  86. #define arg5 r8
  87. #define arg6 r9
  88. #define arg7 STACK_OFFSET+8(%r14)
  89. #define arg8 STACK_OFFSET+16(%r14)
  90. #define arg9 STACK_OFFSET+24(%r14)
  91. #define arg10 STACK_OFFSET+32(%r14)
  92. #define keysize 2*15*16(%arg1)
  93. #endif
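/*
 * Note on the stack arguments: STACK_OFFSET (8*3) accounts for the three
 * general purpose registers pushed on entry to the GCM routines before
 * %r14 is loaded from %rsp.  Relative to %r14 the saved registers occupy
 * offsets 0..23 and the return address sits at 24, so the 7th..10th C
 * arguments are found at 32, 40, 48 and 56, which is exactly what
 * arg7..arg10 expand to.
 */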
  94. #define STATE1 %xmm0
  95. #define STATE2 %xmm4
  96. #define STATE3 %xmm5
  97. #define STATE4 %xmm6
  98. #define STATE STATE1
  99. #define IN1 %xmm1
  100. #define IN2 %xmm7
  101. #define IN3 %xmm8
  102. #define IN4 %xmm9
  103. #define IN IN1
  104. #define KEY %xmm2
  105. #define IV %xmm3
  106. #define BSWAP_MASK %xmm10
  107. #define CTR %xmm11
  108. #define INC %xmm12
  109. #define GF128MUL_MASK %xmm10
  110. #ifdef __x86_64__
  111. #define AREG %rax
  112. #define KEYP %rdi
  113. #define OUTP %rsi
  114. #define UKEYP OUTP
  115. #define INP %rdx
  116. #define LEN %rcx
  117. #define IVP %r8
  118. #define KLEN %r9d
  119. #define T1 %r10
  120. #define TKEYP T1
  121. #define T2 %r11
  122. #define TCTR_LOW T2
  123. #else
  124. #define AREG %eax
  125. #define KEYP %edi
  126. #define OUTP AREG
  127. #define UKEYP OUTP
  128. #define INP %edx
  129. #define LEN %esi
  130. #define IVP %ebp
  131. #define KLEN %ebx
  132. #define T1 %ecx
  133. #define TKEYP T1
  134. #endif
  135. #ifdef __x86_64__
  136. /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
  137. *
  138. *
  139. * Input: A and B (128-bits each, bit-reflected)
  140. * Output: C = A*B*x mod poly, (i.e. >>1 )
  141. * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
  142. * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
  143. *
  144. */
  145. .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
  146. movdqa \GH, \TMP1
  147. pshufd $78, \GH, \TMP2
  148. pshufd $78, \HK, \TMP3
  149. pxor \GH, \TMP2 # TMP2 = a1+a0
  150. pxor \HK, \TMP3 # TMP3 = b1+b0
  151. PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
  152. PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
  153. PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
  154. pxor \GH, \TMP2
  155. pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
  156. movdqa \TMP2, \TMP3
  157. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  158. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  159. pxor \TMP3, \GH
   160. pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
  161. # first phase of the reduction
  162. movdqa \GH, \TMP2
  163. movdqa \GH, \TMP3
  164. movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
   165. # in order to perform
  166. # independent shifts
  167. pslld $31, \TMP2 # packed right shift <<31
  168. pslld $30, \TMP3 # packed right shift <<30
  169. pslld $25, \TMP4 # packed right shift <<25
  170. pxor \TMP3, \TMP2 # xor the shifted versions
  171. pxor \TMP4, \TMP2
  172. movdqa \TMP2, \TMP5
  173. psrldq $4, \TMP5 # right shift TMP5 1 DW
  174. pslldq $12, \TMP2 # left shift TMP2 3 DWs
  175. pxor \TMP2, \GH
  176. # second phase of the reduction
  177. movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
   178. # in order to perform
  179. # independent shifts
  180. movdqa \GH,\TMP3
  181. movdqa \GH,\TMP4
  182. psrld $1,\TMP2 # packed left shift >>1
  183. psrld $2,\TMP3 # packed left shift >>2
  184. psrld $7,\TMP4 # packed left shift >>7
  185. pxor \TMP3,\TMP2 # xor the shifted versions
  186. pxor \TMP4,\TMP2
  187. pxor \TMP5, \TMP2
  188. pxor \TMP2, \GH
   189. pxor \TMP1, \GH # result is in GH
  190. .endm
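/*
 * For reference, a minimal (and deliberately slow) C model of the field
 * multiplication that GHASH_MUL implements, written in the plain bit
 * ordering of the GCM specification rather than the reflected layout used
 * by the code above.  This is an illustrative sketch only; the function
 * name is a placeholder and nothing here is part of the build.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	// Z = X * Y in GF(2^128), GCM convention: bit 0 of the field
 *	// element is the most significant bit of byte 0.
 *	static void ghash_mul_ref(const uint8_t X[16], const uint8_t Y[16],
 *				  uint8_t Z[16])
 *	{
 *		uint8_t V[16];
 *
 *		memset(Z, 0, 16);
 *		memcpy(V, Y, 16);
 *		for (int i = 0; i < 128; i++) {
 *			// if bit i of X is set, Z ^= V
 *			if (X[i / 8] & (0x80 >> (i % 8))) {
 *				for (int j = 0; j < 16; j++)
 *					Z[j] ^= V[j];
 *			}
 *			// V = V * x: shift the 128-bit string by one bit
 *			int carry = V[15] & 1;
 *			for (int j = 15; j > 0; j--)
 *				V[j] = (V[j] >> 1) | (V[j - 1] << 7);
 *			V[0] >>= 1;
 *			if (carry)
 *				V[0] ^= 0xe1;	// reduce by x^128+x^7+x^2+x+1
 *		}
 *	}
 */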
  191. /*
  192. * if a = number of total plaintext bytes
  193. * b = floor(a/16)
  194. * num_initial_blocks = b mod 4
  195. * encrypt the initial num_initial_blocks blocks and apply ghash on
  196. * the ciphertext
  197. * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
  198. * are clobbered
  199. * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
  200. */
  201. .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
  202. XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
  203. MOVADQ SHUF_MASK(%rip), %xmm14
  204. mov arg7, %r10 # %r10 = AAD
  205. mov arg8, %r12 # %r12 = aadLen
  206. mov %r12, %r11
  207. pxor %xmm\i, %xmm\i
  208. _get_AAD_loop\num_initial_blocks\operation:
  209. movd (%r10), \TMP1
  210. pslldq $12, \TMP1
  211. psrldq $4, %xmm\i
  212. pxor \TMP1, %xmm\i
  213. add $4, %r10
  214. sub $4, %r12
  215. jne _get_AAD_loop\num_initial_blocks\operation
  216. cmp $16, %r11
  217. je _get_AAD_loop2_done\num_initial_blocks\operation
  218. mov $16, %r12
  219. _get_AAD_loop2\num_initial_blocks\operation:
  220. psrldq $4, %xmm\i
  221. sub $4, %r12
  222. cmp %r11, %r12
  223. jne _get_AAD_loop2\num_initial_blocks\operation
  224. _get_AAD_loop2_done\num_initial_blocks\operation:
  225. PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
  226. xor %r11, %r11 # initialise the data pointer offset as zero
  227. # start AES for num_initial_blocks blocks
  228. mov %arg5, %rax # %rax = *Y0
  229. movdqu (%rax), \XMM0 # XMM0 = Y0
  230. PSHUFB_XMM %xmm14, \XMM0
  231. .if (\i == 5) || (\i == 6) || (\i == 7)
  232. MOVADQ ONE(%RIP),\TMP1
  233. MOVADQ (%arg1),\TMP2
  234. .irpc index, \i_seq
  235. paddd \TMP1, \XMM0 # INCR Y0
  236. movdqa \XMM0, %xmm\index
  237. PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
  238. pxor \TMP2, %xmm\index
  239. .endr
  240. lea 0x10(%arg1),%r10
  241. mov keysize,%eax
  242. shr $2,%eax # 128->4, 192->6, 256->8
  243. add $5,%eax # 128->9, 192->11, 256->13
  244. aes_loop_initial_dec\num_initial_blocks:
  245. MOVADQ (%r10),\TMP1
  246. .irpc index, \i_seq
  247. AESENC \TMP1, %xmm\index
  248. .endr
  249. add $16,%r10
  250. sub $1,%eax
  251. jnz aes_loop_initial_dec\num_initial_blocks
  252. MOVADQ (%r10), \TMP1
  253. .irpc index, \i_seq
  254. AESENCLAST \TMP1, %xmm\index # Last Round
  255. .endr
  256. .irpc index, \i_seq
  257. movdqu (%arg3 , %r11, 1), \TMP1
  258. pxor \TMP1, %xmm\index
  259. movdqu %xmm\index, (%arg2 , %r11, 1)
  260. # write back plaintext/ciphertext for num_initial_blocks
  261. add $16, %r11
  262. movdqa \TMP1, %xmm\index
  263. PSHUFB_XMM %xmm14, %xmm\index
  264. # prepare plaintext/ciphertext for GHASH computation
  265. .endr
  266. .endif
  267. GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  268. # apply GHASH on num_initial_blocks blocks
  269. .if \i == 5
  270. pxor %xmm5, %xmm6
  271. GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  272. pxor %xmm6, %xmm7
  273. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  274. pxor %xmm7, %xmm8
  275. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  276. .elseif \i == 6
  277. pxor %xmm6, %xmm7
  278. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  279. pxor %xmm7, %xmm8
  280. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  281. .elseif \i == 7
  282. pxor %xmm7, %xmm8
  283. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  284. .endif
  285. cmp $64, %r13
  286. jl _initial_blocks_done\num_initial_blocks\operation
  287. # no need for precomputed values
  288. /*
  289. *
  290. * Precomputations for HashKey parallel with encryption of first 4 blocks.
   291. * HashKey_i_k holds the XORed values of the low and high parts of HashKey^i
  292. */
  293. MOVADQ ONE(%rip), \TMP1
  294. paddd \TMP1, \XMM0 # INCR Y0
  295. MOVADQ \XMM0, \XMM1
  296. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  297. paddd \TMP1, \XMM0 # INCR Y0
  298. MOVADQ \XMM0, \XMM2
  299. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  300. paddd \TMP1, \XMM0 # INCR Y0
  301. MOVADQ \XMM0, \XMM3
  302. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  303. paddd \TMP1, \XMM0 # INCR Y0
  304. MOVADQ \XMM0, \XMM4
  305. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  306. MOVADQ 0(%arg1),\TMP1
  307. pxor \TMP1, \XMM1
  308. pxor \TMP1, \XMM2
  309. pxor \TMP1, \XMM3
  310. pxor \TMP1, \XMM4
  311. movdqa \TMP3, \TMP5
  312. pshufd $78, \TMP3, \TMP1
  313. pxor \TMP3, \TMP1
  314. movdqa \TMP1, HashKey_k(%rsp)
  315. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  316. # TMP5 = HashKey^2<<1 (mod poly)
  317. movdqa \TMP5, HashKey_2(%rsp)
  318. # HashKey_2 = HashKey^2<<1 (mod poly)
  319. pshufd $78, \TMP5, \TMP1
  320. pxor \TMP5, \TMP1
  321. movdqa \TMP1, HashKey_2_k(%rsp)
  322. .irpc index, 1234 # do 4 rounds
  323. movaps 0x10*\index(%arg1), \TMP1
  324. AESENC \TMP1, \XMM1
  325. AESENC \TMP1, \XMM2
  326. AESENC \TMP1, \XMM3
  327. AESENC \TMP1, \XMM4
  328. .endr
  329. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  330. # TMP5 = HashKey^3<<1 (mod poly)
  331. movdqa \TMP5, HashKey_3(%rsp)
  332. pshufd $78, \TMP5, \TMP1
  333. pxor \TMP5, \TMP1
  334. movdqa \TMP1, HashKey_3_k(%rsp)
  335. .irpc index, 56789 # do next 5 rounds
  336. movaps 0x10*\index(%arg1), \TMP1
  337. AESENC \TMP1, \XMM1
  338. AESENC \TMP1, \XMM2
  339. AESENC \TMP1, \XMM3
  340. AESENC \TMP1, \XMM4
  341. .endr
  342. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
   343. # TMP5 = HashKey^4<<1 (mod poly)
  344. movdqa \TMP5, HashKey_4(%rsp)
  345. pshufd $78, \TMP5, \TMP1
  346. pxor \TMP5, \TMP1
  347. movdqa \TMP1, HashKey_4_k(%rsp)
  348. lea 0xa0(%arg1),%r10
  349. mov keysize,%eax
  350. shr $2,%eax # 128->4, 192->6, 256->8
  351. sub $4,%eax # 128->0, 192->2, 256->4
  352. jz aes_loop_pre_dec_done\num_initial_blocks
  353. aes_loop_pre_dec\num_initial_blocks:
  354. MOVADQ (%r10),\TMP2
  355. .irpc index, 1234
  356. AESENC \TMP2, %xmm\index
  357. .endr
  358. add $16,%r10
  359. sub $1,%eax
  360. jnz aes_loop_pre_dec\num_initial_blocks
  361. aes_loop_pre_dec_done\num_initial_blocks:
  362. MOVADQ (%r10), \TMP2
  363. AESENCLAST \TMP2, \XMM1
  364. AESENCLAST \TMP2, \XMM2
  365. AESENCLAST \TMP2, \XMM3
  366. AESENCLAST \TMP2, \XMM4
  367. movdqu 16*0(%arg3 , %r11 , 1), \TMP1
  368. pxor \TMP1, \XMM1
  369. movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
  370. movdqa \TMP1, \XMM1
  371. movdqu 16*1(%arg3 , %r11 , 1), \TMP1
  372. pxor \TMP1, \XMM2
  373. movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
  374. movdqa \TMP1, \XMM2
  375. movdqu 16*2(%arg3 , %r11 , 1), \TMP1
  376. pxor \TMP1, \XMM3
  377. movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
  378. movdqa \TMP1, \XMM3
  379. movdqu 16*3(%arg3 , %r11 , 1), \TMP1
  380. pxor \TMP1, \XMM4
  381. movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
  382. movdqa \TMP1, \XMM4
  383. add $64, %r11
  384. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  385. pxor \XMMDst, \XMM1
  386. # combine GHASHed value with the corresponding ciphertext
  387. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  388. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  389. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  390. _initial_blocks_done\num_initial_blocks\operation:
  391. .endm
  392. /*
  393. * if a = number of total plaintext bytes
  394. * b = floor(a/16)
  395. * num_initial_blocks = b mod 4
  396. * encrypt the initial num_initial_blocks blocks and apply ghash on
  397. * the ciphertext
  398. * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
  399. * are clobbered
  400. * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
  401. */
  402. .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
  403. XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
  404. MOVADQ SHUF_MASK(%rip), %xmm14
  405. mov arg7, %r10 # %r10 = AAD
  406. mov arg8, %r12 # %r12 = aadLen
  407. mov %r12, %r11
  408. pxor %xmm\i, %xmm\i
  409. _get_AAD_loop\num_initial_blocks\operation:
  410. movd (%r10), \TMP1
  411. pslldq $12, \TMP1
  412. psrldq $4, %xmm\i
  413. pxor \TMP1, %xmm\i
  414. add $4, %r10
  415. sub $4, %r12
  416. jne _get_AAD_loop\num_initial_blocks\operation
  417. cmp $16, %r11
  418. je _get_AAD_loop2_done\num_initial_blocks\operation
  419. mov $16, %r12
  420. _get_AAD_loop2\num_initial_blocks\operation:
  421. psrldq $4, %xmm\i
  422. sub $4, %r12
  423. cmp %r11, %r12
  424. jne _get_AAD_loop2\num_initial_blocks\operation
  425. _get_AAD_loop2_done\num_initial_blocks\operation:
  426. PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
  427. xor %r11, %r11 # initialise the data pointer offset as zero
  428. # start AES for num_initial_blocks blocks
  429. mov %arg5, %rax # %rax = *Y0
  430. movdqu (%rax), \XMM0 # XMM0 = Y0
  431. PSHUFB_XMM %xmm14, \XMM0
  432. .if (\i == 5) || (\i == 6) || (\i == 7)
  433. MOVADQ ONE(%RIP),\TMP1
  434. MOVADQ 0(%arg1),\TMP2
  435. .irpc index, \i_seq
  436. paddd \TMP1, \XMM0 # INCR Y0
  437. MOVADQ \XMM0, %xmm\index
  438. PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
  439. pxor \TMP2, %xmm\index
  440. .endr
  441. lea 0x10(%arg1),%r10
  442. mov keysize,%eax
  443. shr $2,%eax # 128->4, 192->6, 256->8
  444. add $5,%eax # 128->9, 192->11, 256->13
  445. aes_loop_initial_enc\num_initial_blocks:
  446. MOVADQ (%r10),\TMP1
  447. .irpc index, \i_seq
  448. AESENC \TMP1, %xmm\index
  449. .endr
  450. add $16,%r10
  451. sub $1,%eax
  452. jnz aes_loop_initial_enc\num_initial_blocks
  453. MOVADQ (%r10), \TMP1
  454. .irpc index, \i_seq
  455. AESENCLAST \TMP1, %xmm\index # Last Round
  456. .endr
  457. .irpc index, \i_seq
  458. movdqu (%arg3 , %r11, 1), \TMP1
  459. pxor \TMP1, %xmm\index
  460. movdqu %xmm\index, (%arg2 , %r11, 1)
  461. # write back plaintext/ciphertext for num_initial_blocks
  462. add $16, %r11
  463. PSHUFB_XMM %xmm14, %xmm\index
  464. # prepare plaintext/ciphertext for GHASH computation
  465. .endr
  466. .endif
  467. GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  468. # apply GHASH on num_initial_blocks blocks
  469. .if \i == 5
  470. pxor %xmm5, %xmm6
  471. GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  472. pxor %xmm6, %xmm7
  473. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  474. pxor %xmm7, %xmm8
  475. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  476. .elseif \i == 6
  477. pxor %xmm6, %xmm7
  478. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  479. pxor %xmm7, %xmm8
  480. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  481. .elseif \i == 7
  482. pxor %xmm7, %xmm8
  483. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  484. .endif
  485. cmp $64, %r13
  486. jl _initial_blocks_done\num_initial_blocks\operation
  487. # no need for precomputed values
  488. /*
  489. *
  490. * Precomputations for HashKey parallel with encryption of first 4 blocks.
   491. * HashKey_i_k holds the XORed values of the low and high parts of HashKey^i
  492. */
  493. MOVADQ ONE(%RIP),\TMP1
  494. paddd \TMP1, \XMM0 # INCR Y0
  495. MOVADQ \XMM0, \XMM1
  496. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  497. paddd \TMP1, \XMM0 # INCR Y0
  498. MOVADQ \XMM0, \XMM2
  499. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  500. paddd \TMP1, \XMM0 # INCR Y0
  501. MOVADQ \XMM0, \XMM3
  502. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  503. paddd \TMP1, \XMM0 # INCR Y0
  504. MOVADQ \XMM0, \XMM4
  505. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  506. MOVADQ 0(%arg1),\TMP1
  507. pxor \TMP1, \XMM1
  508. pxor \TMP1, \XMM2
  509. pxor \TMP1, \XMM3
  510. pxor \TMP1, \XMM4
  511. movdqa \TMP3, \TMP5
  512. pshufd $78, \TMP3, \TMP1
  513. pxor \TMP3, \TMP1
  514. movdqa \TMP1, HashKey_k(%rsp)
  515. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  516. # TMP5 = HashKey^2<<1 (mod poly)
  517. movdqa \TMP5, HashKey_2(%rsp)
  518. # HashKey_2 = HashKey^2<<1 (mod poly)
  519. pshufd $78, \TMP5, \TMP1
  520. pxor \TMP5, \TMP1
  521. movdqa \TMP1, HashKey_2_k(%rsp)
  522. .irpc index, 1234 # do 4 rounds
  523. movaps 0x10*\index(%arg1), \TMP1
  524. AESENC \TMP1, \XMM1
  525. AESENC \TMP1, \XMM2
  526. AESENC \TMP1, \XMM3
  527. AESENC \TMP1, \XMM4
  528. .endr
  529. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  530. # TMP5 = HashKey^3<<1 (mod poly)
  531. movdqa \TMP5, HashKey_3(%rsp)
  532. pshufd $78, \TMP5, \TMP1
  533. pxor \TMP5, \TMP1
  534. movdqa \TMP1, HashKey_3_k(%rsp)
  535. .irpc index, 56789 # do next 5 rounds
  536. movaps 0x10*\index(%arg1), \TMP1
  537. AESENC \TMP1, \XMM1
  538. AESENC \TMP1, \XMM2
  539. AESENC \TMP1, \XMM3
  540. AESENC \TMP1, \XMM4
  541. .endr
  542. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
   543. # TMP5 = HashKey^4<<1 (mod poly)
  544. movdqa \TMP5, HashKey_4(%rsp)
  545. pshufd $78, \TMP5, \TMP1
  546. pxor \TMP5, \TMP1
  547. movdqa \TMP1, HashKey_4_k(%rsp)
  548. lea 0xa0(%arg1),%r10
  549. mov keysize,%eax
  550. shr $2,%eax # 128->4, 192->6, 256->8
  551. sub $4,%eax # 128->0, 192->2, 256->4
  552. jz aes_loop_pre_enc_done\num_initial_blocks
  553. aes_loop_pre_enc\num_initial_blocks:
  554. MOVADQ (%r10),\TMP2
  555. .irpc index, 1234
  556. AESENC \TMP2, %xmm\index
  557. .endr
  558. add $16,%r10
  559. sub $1,%eax
  560. jnz aes_loop_pre_enc\num_initial_blocks
  561. aes_loop_pre_enc_done\num_initial_blocks:
  562. MOVADQ (%r10), \TMP2
  563. AESENCLAST \TMP2, \XMM1
  564. AESENCLAST \TMP2, \XMM2
  565. AESENCLAST \TMP2, \XMM3
  566. AESENCLAST \TMP2, \XMM4
  567. movdqu 16*0(%arg3 , %r11 , 1), \TMP1
  568. pxor \TMP1, \XMM1
  569. movdqu 16*1(%arg3 , %r11 , 1), \TMP1
  570. pxor \TMP1, \XMM2
  571. movdqu 16*2(%arg3 , %r11 , 1), \TMP1
  572. pxor \TMP1, \XMM3
  573. movdqu 16*3(%arg3 , %r11 , 1), \TMP1
  574. pxor \TMP1, \XMM4
  575. movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
  576. movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
  577. movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
  578. movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
  579. add $64, %r11
  580. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  581. pxor \XMMDst, \XMM1
  582. # combine GHASHed value with the corresponding ciphertext
  583. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  584. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  585. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  586. _initial_blocks_done\num_initial_blocks\operation:
  587. .endm
  588. /*
  589. * encrypt 4 blocks at a time
  590. * ghash the 4 previously encrypted ciphertext blocks
  591. * arg1, %arg2, %arg3 are used as pointers only, not modified
  592. * %r11 is the data offset value
  593. */
  594. .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
  595. TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
  596. movdqa \XMM1, \XMM5
  597. movdqa \XMM2, \XMM6
  598. movdqa \XMM3, \XMM7
  599. movdqa \XMM4, \XMM8
  600. movdqa SHUF_MASK(%rip), %xmm15
  601. # multiply TMP5 * HashKey using karatsuba
  602. movdqa \XMM5, \TMP4
  603. pshufd $78, \XMM5, \TMP6
  604. pxor \XMM5, \TMP6
  605. paddd ONE(%rip), \XMM0 # INCR CNT
  606. movdqa HashKey_4(%rsp), \TMP5
  607. PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
  608. movdqa \XMM0, \XMM1
  609. paddd ONE(%rip), \XMM0 # INCR CNT
  610. movdqa \XMM0, \XMM2
  611. paddd ONE(%rip), \XMM0 # INCR CNT
  612. movdqa \XMM0, \XMM3
  613. paddd ONE(%rip), \XMM0 # INCR CNT
  614. movdqa \XMM0, \XMM4
  615. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  616. PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
  617. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  618. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  619. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  620. pxor (%arg1), \XMM1
  621. pxor (%arg1), \XMM2
  622. pxor (%arg1), \XMM3
  623. pxor (%arg1), \XMM4
  624. movdqa HashKey_4_k(%rsp), \TMP5
  625. PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
  626. movaps 0x10(%arg1), \TMP1
  627. AESENC \TMP1, \XMM1 # Round 1
  628. AESENC \TMP1, \XMM2
  629. AESENC \TMP1, \XMM3
  630. AESENC \TMP1, \XMM4
  631. movaps 0x20(%arg1), \TMP1
  632. AESENC \TMP1, \XMM1 # Round 2
  633. AESENC \TMP1, \XMM2
  634. AESENC \TMP1, \XMM3
  635. AESENC \TMP1, \XMM4
  636. movdqa \XMM6, \TMP1
  637. pshufd $78, \XMM6, \TMP2
  638. pxor \XMM6, \TMP2
  639. movdqa HashKey_3(%rsp), \TMP5
  640. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
  641. movaps 0x30(%arg1), \TMP3
  642. AESENC \TMP3, \XMM1 # Round 3
  643. AESENC \TMP3, \XMM2
  644. AESENC \TMP3, \XMM3
  645. AESENC \TMP3, \XMM4
  646. PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
  647. movaps 0x40(%arg1), \TMP3
  648. AESENC \TMP3, \XMM1 # Round 4
  649. AESENC \TMP3, \XMM2
  650. AESENC \TMP3, \XMM3
  651. AESENC \TMP3, \XMM4
  652. movdqa HashKey_3_k(%rsp), \TMP5
  653. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  654. movaps 0x50(%arg1), \TMP3
  655. AESENC \TMP3, \XMM1 # Round 5
  656. AESENC \TMP3, \XMM2
  657. AESENC \TMP3, \XMM3
  658. AESENC \TMP3, \XMM4
  659. pxor \TMP1, \TMP4
  660. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  661. pxor \XMM6, \XMM5
  662. pxor \TMP2, \TMP6
  663. movdqa \XMM7, \TMP1
  664. pshufd $78, \XMM7, \TMP2
  665. pxor \XMM7, \TMP2
  666. movdqa HashKey_2(%rsp ), \TMP5
  667. # Multiply TMP5 * HashKey using karatsuba
  668. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  669. movaps 0x60(%arg1), \TMP3
  670. AESENC \TMP3, \XMM1 # Round 6
  671. AESENC \TMP3, \XMM2
  672. AESENC \TMP3, \XMM3
  673. AESENC \TMP3, \XMM4
  674. PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
  675. movaps 0x70(%arg1), \TMP3
  676. AESENC \TMP3, \XMM1 # Round 7
  677. AESENC \TMP3, \XMM2
  678. AESENC \TMP3, \XMM3
  679. AESENC \TMP3, \XMM4
  680. movdqa HashKey_2_k(%rsp), \TMP5
  681. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  682. movaps 0x80(%arg1), \TMP3
  683. AESENC \TMP3, \XMM1 # Round 8
  684. AESENC \TMP3, \XMM2
  685. AESENC \TMP3, \XMM3
  686. AESENC \TMP3, \XMM4
  687. pxor \TMP1, \TMP4
  688. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  689. pxor \XMM7, \XMM5
  690. pxor \TMP2, \TMP6
  691. # Multiply XMM8 * HashKey
  692. # XMM8 and TMP5 hold the values for the two operands
  693. movdqa \XMM8, \TMP1
  694. pshufd $78, \XMM8, \TMP2
  695. pxor \XMM8, \TMP2
  696. movdqa HashKey(%rsp), \TMP5
  697. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  698. movaps 0x90(%arg1), \TMP3
  699. AESENC \TMP3, \XMM1 # Round 9
  700. AESENC \TMP3, \XMM2
  701. AESENC \TMP3, \XMM3
  702. AESENC \TMP3, \XMM4
  703. PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
  704. lea 0xa0(%arg1),%r10
  705. mov keysize,%eax
  706. shr $2,%eax # 128->4, 192->6, 256->8
  707. sub $4,%eax # 128->0, 192->2, 256->4
  708. jz aes_loop_par_enc_done
  709. aes_loop_par_enc:
  710. MOVADQ (%r10),\TMP3
  711. .irpc index, 1234
  712. AESENC \TMP3, %xmm\index
  713. .endr
  714. add $16,%r10
  715. sub $1,%eax
  716. jnz aes_loop_par_enc
  717. aes_loop_par_enc_done:
  718. MOVADQ (%r10), \TMP3
  719. AESENCLAST \TMP3, \XMM1 # Round 10
  720. AESENCLAST \TMP3, \XMM2
  721. AESENCLAST \TMP3, \XMM3
  722. AESENCLAST \TMP3, \XMM4
  723. movdqa HashKey_k(%rsp), \TMP5
  724. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  725. movdqu (%arg3,%r11,1), \TMP3
  726. pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
  727. movdqu 16(%arg3,%r11,1), \TMP3
  728. pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
  729. movdqu 32(%arg3,%r11,1), \TMP3
  730. pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
  731. movdqu 48(%arg3,%r11,1), \TMP3
  732. pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
  733. movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
  734. movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
  735. movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
  736. movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
  737. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  738. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  739. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  740. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  741. pxor \TMP4, \TMP1
  742. pxor \XMM8, \XMM5
  743. pxor \TMP6, \TMP2
  744. pxor \TMP1, \TMP2
  745. pxor \XMM5, \TMP2
  746. movdqa \TMP2, \TMP3
  747. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  748. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  749. pxor \TMP3, \XMM5
  750. pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
  751. # first phase of reduction
  752. movdqa \XMM5, \TMP2
  753. movdqa \XMM5, \TMP3
  754. movdqa \XMM5, \TMP4
  755. # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
  756. pslld $31, \TMP2 # packed right shift << 31
  757. pslld $30, \TMP3 # packed right shift << 30
  758. pslld $25, \TMP4 # packed right shift << 25
  759. pxor \TMP3, \TMP2 # xor the shifted versions
  760. pxor \TMP4, \TMP2
  761. movdqa \TMP2, \TMP5
  762. psrldq $4, \TMP5 # right shift T5 1 DW
  763. pslldq $12, \TMP2 # left shift T2 3 DWs
  764. pxor \TMP2, \XMM5
  765. # second phase of reduction
  766. movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
  767. movdqa \XMM5,\TMP3
  768. movdqa \XMM5,\TMP4
  769. psrld $1, \TMP2 # packed left shift >>1
  770. psrld $2, \TMP3 # packed left shift >>2
  771. psrld $7, \TMP4 # packed left shift >>7
  772. pxor \TMP3,\TMP2 # xor the shifted versions
  773. pxor \TMP4,\TMP2
  774. pxor \TMP5, \TMP2
  775. pxor \TMP2, \XMM5
   776. pxor \TMP1, \XMM5 # result is in XMM5
  777. pxor \XMM5, \XMM1
  778. .endm
  779. /*
  780. * decrypt 4 blocks at a time
  781. * ghash the 4 previously decrypted ciphertext blocks
  782. * arg1, %arg2, %arg3 are used as pointers only, not modified
  783. * %r11 is the data offset value
  784. */
  785. .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
  786. TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
  787. movdqa \XMM1, \XMM5
  788. movdqa \XMM2, \XMM6
  789. movdqa \XMM3, \XMM7
  790. movdqa \XMM4, \XMM8
  791. movdqa SHUF_MASK(%rip), %xmm15
  792. # multiply TMP5 * HashKey using karatsuba
  793. movdqa \XMM5, \TMP4
  794. pshufd $78, \XMM5, \TMP6
  795. pxor \XMM5, \TMP6
  796. paddd ONE(%rip), \XMM0 # INCR CNT
  797. movdqa HashKey_4(%rsp), \TMP5
  798. PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
  799. movdqa \XMM0, \XMM1
  800. paddd ONE(%rip), \XMM0 # INCR CNT
  801. movdqa \XMM0, \XMM2
  802. paddd ONE(%rip), \XMM0 # INCR CNT
  803. movdqa \XMM0, \XMM3
  804. paddd ONE(%rip), \XMM0 # INCR CNT
  805. movdqa \XMM0, \XMM4
  806. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  807. PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
  808. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  809. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  810. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  811. pxor (%arg1), \XMM1
  812. pxor (%arg1), \XMM2
  813. pxor (%arg1), \XMM3
  814. pxor (%arg1), \XMM4
  815. movdqa HashKey_4_k(%rsp), \TMP5
  816. PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
  817. movaps 0x10(%arg1), \TMP1
  818. AESENC \TMP1, \XMM1 # Round 1
  819. AESENC \TMP1, \XMM2
  820. AESENC \TMP1, \XMM3
  821. AESENC \TMP1, \XMM4
  822. movaps 0x20(%arg1), \TMP1
  823. AESENC \TMP1, \XMM1 # Round 2
  824. AESENC \TMP1, \XMM2
  825. AESENC \TMP1, \XMM3
  826. AESENC \TMP1, \XMM4
  827. movdqa \XMM6, \TMP1
  828. pshufd $78, \XMM6, \TMP2
  829. pxor \XMM6, \TMP2
  830. movdqa HashKey_3(%rsp), \TMP5
  831. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
  832. movaps 0x30(%arg1), \TMP3
  833. AESENC \TMP3, \XMM1 # Round 3
  834. AESENC \TMP3, \XMM2
  835. AESENC \TMP3, \XMM3
  836. AESENC \TMP3, \XMM4
  837. PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
  838. movaps 0x40(%arg1), \TMP3
  839. AESENC \TMP3, \XMM1 # Round 4
  840. AESENC \TMP3, \XMM2
  841. AESENC \TMP3, \XMM3
  842. AESENC \TMP3, \XMM4
  843. movdqa HashKey_3_k(%rsp), \TMP5
  844. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  845. movaps 0x50(%arg1), \TMP3
  846. AESENC \TMP3, \XMM1 # Round 5
  847. AESENC \TMP3, \XMM2
  848. AESENC \TMP3, \XMM3
  849. AESENC \TMP3, \XMM4
  850. pxor \TMP1, \TMP4
  851. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  852. pxor \XMM6, \XMM5
  853. pxor \TMP2, \TMP6
  854. movdqa \XMM7, \TMP1
  855. pshufd $78, \XMM7, \TMP2
  856. pxor \XMM7, \TMP2
  857. movdqa HashKey_2(%rsp ), \TMP5
  858. # Multiply TMP5 * HashKey using karatsuba
  859. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  860. movaps 0x60(%arg1), \TMP3
  861. AESENC \TMP3, \XMM1 # Round 6
  862. AESENC \TMP3, \XMM2
  863. AESENC \TMP3, \XMM3
  864. AESENC \TMP3, \XMM4
  865. PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
  866. movaps 0x70(%arg1), \TMP3
  867. AESENC \TMP3, \XMM1 # Round 7
  868. AESENC \TMP3, \XMM2
  869. AESENC \TMP3, \XMM3
  870. AESENC \TMP3, \XMM4
  871. movdqa HashKey_2_k(%rsp), \TMP5
  872. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  873. movaps 0x80(%arg1), \TMP3
  874. AESENC \TMP3, \XMM1 # Round 8
  875. AESENC \TMP3, \XMM2
  876. AESENC \TMP3, \XMM3
  877. AESENC \TMP3, \XMM4
  878. pxor \TMP1, \TMP4
  879. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  880. pxor \XMM7, \XMM5
  881. pxor \TMP2, \TMP6
  882. # Multiply XMM8 * HashKey
  883. # XMM8 and TMP5 hold the values for the two operands
  884. movdqa \XMM8, \TMP1
  885. pshufd $78, \XMM8, \TMP2
  886. pxor \XMM8, \TMP2
  887. movdqa HashKey(%rsp), \TMP5
  888. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  889. movaps 0x90(%arg1), \TMP3
  890. AESENC \TMP3, \XMM1 # Round 9
  891. AESENC \TMP3, \XMM2
  892. AESENC \TMP3, \XMM3
  893. AESENC \TMP3, \XMM4
  894. PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
  895. lea 0xa0(%arg1),%r10
  896. mov keysize,%eax
  897. shr $2,%eax # 128->4, 192->6, 256->8
  898. sub $4,%eax # 128->0, 192->2, 256->4
  899. jz aes_loop_par_dec_done
  900. aes_loop_par_dec:
  901. MOVADQ (%r10),\TMP3
  902. .irpc index, 1234
  903. AESENC \TMP3, %xmm\index
  904. .endr
  905. add $16,%r10
  906. sub $1,%eax
  907. jnz aes_loop_par_dec
  908. aes_loop_par_dec_done:
  909. MOVADQ (%r10), \TMP3
  910. AESENCLAST \TMP3, \XMM1 # last round
  911. AESENCLAST \TMP3, \XMM2
  912. AESENCLAST \TMP3, \XMM3
  913. AESENCLAST \TMP3, \XMM4
  914. movdqa HashKey_k(%rsp), \TMP5
  915. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  916. movdqu (%arg3,%r11,1), \TMP3
  917. pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
  918. movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
  919. movdqa \TMP3, \XMM1
  920. movdqu 16(%arg3,%r11,1), \TMP3
  921. pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
  922. movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
  923. movdqa \TMP3, \XMM2
  924. movdqu 32(%arg3,%r11,1), \TMP3
  925. pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
  926. movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
  927. movdqa \TMP3, \XMM3
  928. movdqu 48(%arg3,%r11,1), \TMP3
  929. pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
  930. movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
  931. movdqa \TMP3, \XMM4
  932. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  933. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  934. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  935. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  936. pxor \TMP4, \TMP1
  937. pxor \XMM8, \XMM5
  938. pxor \TMP6, \TMP2
  939. pxor \TMP1, \TMP2
  940. pxor \XMM5, \TMP2
  941. movdqa \TMP2, \TMP3
  942. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  943. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  944. pxor \TMP3, \XMM5
  945. pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
  946. # first phase of reduction
  947. movdqa \XMM5, \TMP2
  948. movdqa \XMM5, \TMP3
  949. movdqa \XMM5, \TMP4
  950. # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
  951. pslld $31, \TMP2 # packed right shift << 31
  952. pslld $30, \TMP3 # packed right shift << 30
  953. pslld $25, \TMP4 # packed right shift << 25
  954. pxor \TMP3, \TMP2 # xor the shifted versions
  955. pxor \TMP4, \TMP2
  956. movdqa \TMP2, \TMP5
  957. psrldq $4, \TMP5 # right shift T5 1 DW
  958. pslldq $12, \TMP2 # left shift T2 3 DWs
  959. pxor \TMP2, \XMM5
  960. # second phase of reduction
  961. movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
  962. movdqa \XMM5,\TMP3
  963. movdqa \XMM5,\TMP4
  964. psrld $1, \TMP2 # packed left shift >>1
  965. psrld $2, \TMP3 # packed left shift >>2
  966. psrld $7, \TMP4 # packed left shift >>7
  967. pxor \TMP3,\TMP2 # xor the shifted versions
  968. pxor \TMP4,\TMP2
  969. pxor \TMP5, \TMP2
  970. pxor \TMP2, \XMM5
   971. pxor \TMP1, \XMM5 # result is in XMM5
  972. pxor \XMM5, \XMM1
  973. .endm
  974. /* GHASH the last 4 ciphertext blocks. */
  975. .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
  976. TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
  977. # Multiply TMP6 * HashKey (using Karatsuba)
  978. movdqa \XMM1, \TMP6
  979. pshufd $78, \XMM1, \TMP2
  980. pxor \XMM1, \TMP2
  981. movdqa HashKey_4(%rsp), \TMP5
  982. PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
  983. PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
  984. movdqa HashKey_4_k(%rsp), \TMP4
  985. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  986. movdqa \XMM1, \XMMDst
  987. movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
  988. # Multiply TMP1 * HashKey (using Karatsuba)
  989. movdqa \XMM2, \TMP1
  990. pshufd $78, \XMM2, \TMP2
  991. pxor \XMM2, \TMP2
  992. movdqa HashKey_3(%rsp), \TMP5
  993. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  994. PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
  995. movdqa HashKey_3_k(%rsp), \TMP4
  996. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  997. pxor \TMP1, \TMP6
  998. pxor \XMM2, \XMMDst
  999. pxor \TMP2, \XMM1
  1000. # results accumulated in TMP6, XMMDst, XMM1
  1001. # Multiply TMP1 * HashKey (using Karatsuba)
  1002. movdqa \XMM3, \TMP1
  1003. pshufd $78, \XMM3, \TMP2
  1004. pxor \XMM3, \TMP2
  1005. movdqa HashKey_2(%rsp), \TMP5
  1006. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  1007. PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
  1008. movdqa HashKey_2_k(%rsp), \TMP4
  1009. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1010. pxor \TMP1, \TMP6
  1011. pxor \XMM3, \XMMDst
  1012. pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
  1013. # Multiply TMP1 * HashKey (using Karatsuba)
  1014. movdqa \XMM4, \TMP1
  1015. pshufd $78, \XMM4, \TMP2
  1016. pxor \XMM4, \TMP2
  1017. movdqa HashKey(%rsp), \TMP5
  1018. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  1019. PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
  1020. movdqa HashKey_k(%rsp), \TMP4
  1021. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1022. pxor \TMP1, \TMP6
  1023. pxor \XMM4, \XMMDst
  1024. pxor \XMM1, \TMP2
  1025. pxor \TMP6, \TMP2
  1026. pxor \XMMDst, \TMP2
  1027. # middle section of the temp results combined as in karatsuba algorithm
  1028. movdqa \TMP2, \TMP4
  1029. pslldq $8, \TMP4 # left shift TMP4 2 DWs
  1030. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  1031. pxor \TMP4, \XMMDst
  1032. pxor \TMP2, \TMP6
  1033. # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
  1034. # first phase of the reduction
  1035. movdqa \XMMDst, \TMP2
  1036. movdqa \XMMDst, \TMP3
  1037. movdqa \XMMDst, \TMP4
  1038. # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
  1039. pslld $31, \TMP2 # packed right shifting << 31
  1040. pslld $30, \TMP3 # packed right shifting << 30
  1041. pslld $25, \TMP4 # packed right shifting << 25
  1042. pxor \TMP3, \TMP2 # xor the shifted versions
  1043. pxor \TMP4, \TMP2
  1044. movdqa \TMP2, \TMP7
  1045. psrldq $4, \TMP7 # right shift TMP7 1 DW
  1046. pslldq $12, \TMP2 # left shift TMP2 3 DWs
  1047. pxor \TMP2, \XMMDst
  1048. # second phase of the reduction
  1049. movdqa \XMMDst, \TMP2
  1050. # make 3 copies of XMMDst for doing 3 shift operations
  1051. movdqa \XMMDst, \TMP3
  1052. movdqa \XMMDst, \TMP4
  1053. psrld $1, \TMP2 # packed left shift >> 1
  1054. psrld $2, \TMP3 # packed left shift >> 2
  1055. psrld $7, \TMP4 # packed left shift >> 7
  1056. pxor \TMP3, \TMP2 # xor the shifted versions
  1057. pxor \TMP4, \TMP2
  1058. pxor \TMP7, \TMP2
  1059. pxor \TMP2, \XMMDst
  1060. pxor \TMP6, \XMMDst # reduced result is in XMMDst
  1061. .endm
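/*
 * In other words, with X1..X4 the four input blocks and H the hash key,
 * GHASH_LAST_4 evaluates
 *
 *	XMMDst = (X1*H^4 + X2*H^3 + X3*H^2 + X4*H) mod poly
 *
 * in one pass: the Karatsuba partial products of every block are
 * accumulated in TMP6, XMMDst and XMM1, and a single reduction is done at
 * the end.
 */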
  1062. /* Encryption of a single block
  1063. * uses eax & r10
  1064. */
  1065. .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
  1066. pxor (%arg1), \XMM0
  1067. mov keysize,%eax
  1068. shr $2,%eax # 128->4, 192->6, 256->8
  1069. add $5,%eax # 128->9, 192->11, 256->13
  1070. lea 16(%arg1), %r10 # get first expanded key address
  1071. _esb_loop_\@:
  1072. MOVADQ (%r10),\TMP1
  1073. AESENC \TMP1,\XMM0
  1074. add $16,%r10
  1075. sub $1,%eax
  1076. jnz _esb_loop_\@
  1077. MOVADQ (%r10),\TMP1
  1078. AESENCLAST \TMP1,\XMM0
  1079. .endm
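/*
 * The round count used above and in the aes_loop_* helpers is derived from
 * the key length stored in the context (see the keysize define):
 *
 *	nrounds = key_length / 4 + 5;	// 16 -> 9, 24 -> 11, 32 -> 13
 *
 * i.e. the number of full AESENC rounds, with the final round always done
 * separately via AESENCLAST (10/12/14 rounds in total).
 */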
  1080. /*****************************************************************************
  1081. * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1082. * u8 *out, // Plaintext output. Encrypt in-place is allowed.
  1083. * const u8 *in, // Ciphertext input
  1084. * u64 plaintext_len, // Length of data in bytes for decryption.
  1085. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  1086. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  1087. * // concatenated with 0x00000001. 16-byte aligned pointer.
  1088. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  1089. * const u8 *aad, // Additional Authentication Data (AAD)
  1090. * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
  1091. * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
  1092. * // given authentication tag and only return the plaintext if they match.
  1093. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
  1094. * // (most likely), 12 or 8.
  1095. *
  1096. * Assumptions:
  1097. *
  1098. * keys:
  1099. * keys are pre-expanded and aligned to 16 bytes. we are using the first
  1100. * set of 11 keys in the data structure void *aes_ctx
  1101. *
  1102. * iv:
  1103. * 0 1 2 3
  1104. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1105. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1106. * | Salt (From the SA) |
  1107. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1108. * | Initialization Vector |
  1109. * | (This is the sequence number from IPSec header) |
  1110. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1111. * | 0x1 |
  1112. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1113. *
  1114. *
  1115. *
  1116. * AAD:
  1117. * AAD padded to 128 bits with 0
  1118. * for example, assume AAD is a u32 vector
  1119. *
  1120. * if AAD is 8 bytes:
  1121. * AAD[3] = {A0, A1};
  1122. * padded AAD in xmm register = {A1 A0 0 0}
  1123. *
  1124. * 0 1 2 3
  1125. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1126. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1127. * | SPI (A1) |
  1128. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1129. * | 32-bit Sequence Number (A0) |
  1130. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1131. * | 0x0 |
  1132. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1133. *
  1134. * AAD Format with 32-bit Sequence Number
  1135. *
  1136. * if AAD is 12 bytes:
  1137. * AAD[3] = {A0, A1, A2};
  1138. * padded AAD in xmm register = {A2 A1 A0 0}
  1139. *
  1140. * 0 1 2 3
  1141. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1142. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1143. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1144. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1145. * | SPI (A2) |
  1146. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1147. * | 64-bit Extended Sequence Number {A1,A0} |
  1148. * | |
  1149. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1150. * | 0x0 |
  1151. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1152. *
  1153. * AAD Format with 64-bit Extended Sequence Number
  1154. *
  1155. * aadLen:
  1156. * from the definition of the spec, aadLen can only be 8 or 12 bytes.
  1157. * The code supports 16 too but for other sizes, the code will fail.
  1158. *
  1159. * TLen:
  1160. * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
  1161. * For other sizes, the code will fail.
  1162. *
  1163. * poly = x^128 + x^127 + x^126 + x^121 + 1
  1164. *
  1165. *****************************************************************************/
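/*
 * For reference, the interface documented above written out as a C
 * declaration (a sketch based on this comment; the authoritative
 * declaration lives in the C glue code):
 *
 *	void aesni_gcm_dec(void *aes_ctx, u8 *out, const u8 *in,
 *			   u64 plaintext_len, u8 *iv, u8 *hash_subkey,
 *			   const u8 *aad, u64 aad_len,
 *			   u8 *auth_tag, u64 auth_tag_len);
 */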
  1166. ENTRY(aesni_gcm_dec)
  1167. push %r12
  1168. push %r13
  1169. push %r14
  1170. mov %rsp, %r14
  1171. /*
  1172. * states of %xmm registers %xmm6:%xmm15 not saved
  1173. * all %xmm registers are clobbered
  1174. */
  1175. sub $VARIABLE_OFFSET, %rsp
  1176. and $~63, %rsp # align rsp to 64 bytes
  1177. mov %arg6, %r12
  1178. movdqu (%r12), %xmm13 # %xmm13 = HashKey
  1179. movdqa SHUF_MASK(%rip), %xmm2
  1180. PSHUFB_XMM %xmm2, %xmm13
  1181. # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
  1182. movdqa %xmm13, %xmm2
  1183. psllq $1, %xmm13
  1184. psrlq $63, %xmm2
  1185. movdqa %xmm2, %xmm1
  1186. pslldq $8, %xmm2
  1187. psrldq $8, %xmm1
  1188. por %xmm2, %xmm13
  1189. # Reduction
  1190. pshufd $0x24, %xmm1, %xmm2
  1191. pcmpeqd TWOONE(%rip), %xmm2
  1192. pand POLY(%rip), %xmm2
  1193. pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
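# The block above computes HashKey<<1 (mod poly) without PCLMULQDQ:
# psllq/psrlq shift each 64-bit half left by one and capture the carried-out
# bits, pslldq/psrldq/por propagate the carry between the two halves, and the
# pshufd/pcmpeqd/pand sequence turns the bit shifted out of the whole 128-bit
# value into a mask so that POLY is XORed in only when the shift overflowed.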
  1194. # Decrypt first few blocks
  1195. movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
  1196. mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
  1197. and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
  1198. mov %r13, %r12
  1199. and $(3<<4), %r12
  1200. jz _initial_num_blocks_is_0_decrypt
  1201. cmp $(2<<4), %r12
  1202. jb _initial_num_blocks_is_1_decrypt
  1203. je _initial_num_blocks_is_2_decrypt
  1204. _initial_num_blocks_is_3_decrypt:
  1205. INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1206. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
  1207. sub $48, %r13
  1208. jmp _initial_blocks_decrypted
  1209. _initial_num_blocks_is_2_decrypt:
  1210. INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1211. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
  1212. sub $32, %r13
  1213. jmp _initial_blocks_decrypted
  1214. _initial_num_blocks_is_1_decrypt:
  1215. INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1216. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
  1217. sub $16, %r13
  1218. jmp _initial_blocks_decrypted
  1219. _initial_num_blocks_is_0_decrypt:
  1220. INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1221. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
  1222. _initial_blocks_decrypted:
  1223. cmp $0, %r13
  1224. je _zero_cipher_left_decrypt
  1225. sub $64, %r13
  1226. je _four_cipher_left_decrypt
  1227. _decrypt_by_4:
  1228. GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
  1229. %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
  1230. add $64, %r11
  1231. sub $64, %r13
  1232. jne _decrypt_by_4
  1233. _four_cipher_left_decrypt:
  1234. GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
  1235. %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
  1236. _zero_cipher_left_decrypt:
  1237. mov %arg4, %r13
  1238. and $15, %r13 # %r13 = arg4 (mod 16)
  1239. je _multiple_of_16_bytes_decrypt
  1240. # Handle the last <16 byte block separately
  1241. paddd ONE(%rip), %xmm0 # increment CNT to get Yn
  1242. movdqa SHUF_MASK(%rip), %xmm10
  1243. PSHUFB_XMM %xmm10, %xmm0
  1244. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
  1245. sub $16, %r11
  1246. add %r13, %r11
  1247. movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
  1248. lea SHIFT_MASK+16(%rip), %r12
  1249. sub %r13, %r12
  1250. # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
  1251. # (%r13 is the number of bytes in plaintext mod 16)
  1252. movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1253. PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
  1254. movdqa %xmm1, %xmm2
  1255. pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
  1256. movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  1257. # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
  1258. pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
  1259. pand %xmm1, %xmm2
  1260. movdqa SHUF_MASK(%rip), %xmm10
1261. PSHUFB_XMM %xmm10, %xmm2
  1262. pxor %xmm2, %xmm8
  1263. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1264. # GHASH computation for the last <16 byte block
  1265. sub %r13, %r11
  1266. add $16, %r11
  1267. # output %r13 bytes
  1268. MOVQ_R64_XMM %xmm0, %rax
  1269. cmp $8, %r13
  1270. jle _less_than_8_bytes_left_decrypt
  1271. mov %rax, (%arg2 , %r11, 1)
  1272. add $8, %r11
  1273. psrldq $8, %xmm0
  1274. MOVQ_R64_XMM %xmm0, %rax
  1275. sub $8, %r13
  1276. _less_than_8_bytes_left_decrypt:
  1277. mov %al, (%arg2, %r11, 1)
  1278. add $1, %r11
  1279. shr $8, %rax
  1280. sub $1, %r13
  1281. jne _less_than_8_bytes_left_decrypt
  1282. _multiple_of_16_bytes_decrypt:
1283. mov arg8, %r12 # %r12 = aadLen (number of bytes)
  1284. shl $3, %r12 # convert into number of bits
  1285. movd %r12d, %xmm15 # len(A) in %xmm15
1286. shl $3, %arg4 # len(C) in bits (*8)
  1287. MOVQ_R64_XMM %arg4, %xmm1
  1288. pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
  1289. pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
  1290. pxor %xmm15, %xmm8
  1291. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1292. # final GHASH computation
  1293. movdqa SHUF_MASK(%rip), %xmm10
  1294. PSHUFB_XMM %xmm10, %xmm8
  1295. mov %arg5, %rax # %rax = *Y0
  1296. movdqu (%rax), %xmm0 # %xmm0 = Y0
  1297. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
  1298. pxor %xmm8, %xmm0
  1299. _return_T_decrypt:
  1300. mov arg9, %r10 # %r10 = authTag
  1301. mov arg10, %r11 # %r11 = auth_tag_len
  1302. cmp $16, %r11
  1303. je _T_16_decrypt
  1304. cmp $12, %r11
  1305. je _T_12_decrypt
  1306. _T_8_decrypt:
  1307. MOVQ_R64_XMM %xmm0, %rax
  1308. mov %rax, (%r10)
  1309. jmp _return_T_done_decrypt
  1310. _T_12_decrypt:
  1311. MOVQ_R64_XMM %xmm0, %rax
  1312. mov %rax, (%r10)
  1313. psrldq $8, %xmm0
  1314. movd %xmm0, %eax
  1315. mov %eax, 8(%r10)
  1316. jmp _return_T_done_decrypt
  1317. _T_16_decrypt:
  1318. movdqu %xmm0, (%r10)
  1319. _return_T_done_decrypt:
  1320. mov %r14, %rsp
  1321. pop %r14
  1322. pop %r13
  1323. pop %r12
  1324. ret
  1325. ENDPROC(aesni_gcm_dec)
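/*
 * The _T_8/_T_12/_T_16 branches above only differ in how many bytes of the
 * 16-byte block E(K, Y0) xor GHASH are copied out. A minimal C sketch of the
 * same truncation (illustrative; tag_block stands in for %xmm0):
 *
 *     #include <string.h>
 *
 *     static void store_auth_tag(unsigned char *auth_tag,
 *                                const unsigned char tag_block[16],
 *                                unsigned long auth_tag_len)
 *     {
 *             // auth_tag_len is 8, 12 or 16, as documented above
 *             memcpy(auth_tag, tag_block, auth_tag_len);
 *     }
 */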
  1326. /*****************************************************************************
  1327. * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1328. * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
  1329. * const u8 *in, // Plaintext input
  1330. * u64 plaintext_len, // Length of data in bytes for encryption.
  1331. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  1332. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  1333. * // concatenated with 0x00000001. 16-byte aligned pointer.
  1334. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  1335. * const u8 *aad, // Additional Authentication Data (AAD)
  1336. * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
  1337. * u8 *auth_tag, // Authenticated Tag output.
  1338. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
  1339. * // 12 or 8.
  1340. *
  1341. * Assumptions:
  1342. *
  1343. * keys:
1344. * keys are pre-expanded and aligned to 16 bytes. We are using the
1345. * first set of 11 round keys in the data structure void *aes_ctx.
  1346. *
  1347. *
  1348. * iv:
  1349. * 0 1 2 3
  1350. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1351. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1352. * | Salt (From the SA) |
  1353. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1354. * | Initialization Vector |
  1355. * | (This is the sequence number from IPSec header) |
  1356. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1357. * | 0x1 |
  1358. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1359. *
  1360. *
  1361. *
  1362. * AAD:
  1363. * AAD padded to 128 bits with 0
  1364. * for example, assume AAD is a u32 vector
  1365. *
  1366. * if AAD is 8 bytes:
  1367. * AAD[3] = {A0, A1};
  1368. * padded AAD in xmm register = {A1 A0 0 0}
  1369. *
  1370. * 0 1 2 3
  1371. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1372. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1373. * | SPI (A1) |
  1374. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1375. * | 32-bit Sequence Number (A0) |
  1376. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1377. * | 0x0 |
  1378. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1379. *
  1380. * AAD Format with 32-bit Sequence Number
  1381. *
  1382. * if AAD is 12 bytes:
  1383. * AAD[3] = {A0, A1, A2};
  1384. * padded AAD in xmm register = {A2 A1 A0 0}
  1385. *
  1386. * 0 1 2 3
  1387. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1388. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1389. * | SPI (A2) |
  1390. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1391. * | 64-bit Extended Sequence Number {A1,A0} |
  1392. * | |
  1393. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1394. * | 0x0 |
  1395. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1396. *
  1397. * AAD Format with 64-bit Extended Sequence Number
  1398. *
  1399. * aadLen:
1400. * per the spec, aadLen can only be 8 or 12 bytes.
1401. * The code also supports 16; for any other size it will fail.
  1402. *
  1403. * TLen:
1404. * per the spec, TLen can only be 8, 12 or 16 bytes.
1405. * For any other size, the code will fail.
  1406. *
  1407. * poly = x^128 + x^127 + x^126 + x^121 + 1
  1408. ***************************************************************************/
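/*
 * The AAD handling described above amounts to zero-padding the 8-, 12- or
 * 16-byte AAD to a full 128-bit GHASH block. A minimal C sketch (the byte
 * ordering inside the xmm register follows the diagrams above and is not
 * shown here; the helper name is hypothetical):
 *
 *     #include <string.h>
 *
 *     static void pad_aad_block(unsigned char block[16],
 *                               const unsigned char *aad,
 *                               unsigned long aad_len)   // 8, 12 or 16
 *     {
 *             memset(block, 0, 16);
 *             memcpy(block, aad, aad_len);
 *     }
 */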
  1409. ENTRY(aesni_gcm_enc)
  1410. push %r12
  1411. push %r13
  1412. push %r14
  1413. mov %rsp, %r14
  1414. #
  1415. # states of %xmm registers %xmm6:%xmm15 not saved
  1416. # all %xmm registers are clobbered
  1417. #
  1418. sub $VARIABLE_OFFSET, %rsp
  1419. and $~63, %rsp
  1420. mov %arg6, %r12
  1421. movdqu (%r12), %xmm13
  1422. movdqa SHUF_MASK(%rip), %xmm2
  1423. PSHUFB_XMM %xmm2, %xmm13
  1424. # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
  1425. movdqa %xmm13, %xmm2
  1426. psllq $1, %xmm13
  1427. psrlq $63, %xmm2
  1428. movdqa %xmm2, %xmm1
  1429. pslldq $8, %xmm2
  1430. psrldq $8, %xmm1
  1431. por %xmm2, %xmm13
  1432. # reduce HashKey<<1
  1433. pshufd $0x24, %xmm1, %xmm2
  1434. pcmpeqd TWOONE(%rip), %xmm2
  1435. pand POLY(%rip), %xmm2
1436. pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1437. movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1438. mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
  1439. and $-16, %r13
  1440. mov %r13, %r12
  1441. # Encrypt first few blocks
  1442. and $(3<<4), %r12
  1443. jz _initial_num_blocks_is_0_encrypt
  1444. cmp $(2<<4), %r12
  1445. jb _initial_num_blocks_is_1_encrypt
  1446. je _initial_num_blocks_is_2_encrypt
  1447. _initial_num_blocks_is_3_encrypt:
  1448. INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1449. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
  1450. sub $48, %r13
  1451. jmp _initial_blocks_encrypted
  1452. _initial_num_blocks_is_2_encrypt:
  1453. INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1454. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
  1455. sub $32, %r13
  1456. jmp _initial_blocks_encrypted
  1457. _initial_num_blocks_is_1_encrypt:
  1458. INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1459. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
  1460. sub $16, %r13
  1461. jmp _initial_blocks_encrypted
  1462. _initial_num_blocks_is_0_encrypt:
  1463. INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1464. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
  1465. _initial_blocks_encrypted:
  1466. # Main loop - Encrypt remaining blocks
  1467. cmp $0, %r13
  1468. je _zero_cipher_left_encrypt
  1469. sub $64, %r13
  1470. je _four_cipher_left_encrypt
  1471. _encrypt_by_4_encrypt:
  1472. GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
  1473. %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
  1474. add $64, %r11
  1475. sub $64, %r13
  1476. jne _encrypt_by_4_encrypt
  1477. _four_cipher_left_encrypt:
  1478. GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
  1479. %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
  1480. _zero_cipher_left_encrypt:
  1481. mov %arg4, %r13
  1482. and $15, %r13 # %r13 = arg4 (mod 16)
  1483. je _multiple_of_16_bytes_encrypt
  1484. # Handle the last <16 Byte block separately
  1485. paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
  1486. movdqa SHUF_MASK(%rip), %xmm10
  1487. PSHUFB_XMM %xmm10, %xmm0
  1488. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
  1489. sub $16, %r11
  1490. add %r13, %r11
1491. movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
  1492. lea SHIFT_MASK+16(%rip), %r12
  1493. sub %r13, %r12
  1494. # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
  1495. # (%r13 is the number of bytes in plaintext mod 16)
  1496. movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1497. PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
  1498. pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
  1499. movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  1500. # get the appropriate mask to mask out top 16-r13 bytes of xmm0
  1501. pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
  1502. movdqa SHUF_MASK(%rip), %xmm10
1503. PSHUFB_XMM %xmm10, %xmm0
  1504. pxor %xmm0, %xmm8
  1505. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1506. # GHASH computation for the last <16 byte block
  1507. sub %r13, %r11
  1508. add $16, %r11
  1509. movdqa SHUF_MASK(%rip), %xmm10
  1510. PSHUFB_XMM %xmm10, %xmm0
  1511. # shuffle xmm0 back to output as ciphertext
  1512. # Output %r13 bytes
  1513. MOVQ_R64_XMM %xmm0, %rax
  1514. cmp $8, %r13
  1515. jle _less_than_8_bytes_left_encrypt
  1516. mov %rax, (%arg2 , %r11, 1)
  1517. add $8, %r11
  1518. psrldq $8, %xmm0
  1519. MOVQ_R64_XMM %xmm0, %rax
  1520. sub $8, %r13
  1521. _less_than_8_bytes_left_encrypt:
  1522. mov %al, (%arg2, %r11, 1)
  1523. add $1, %r11
  1524. shr $8, %rax
  1525. sub $1, %r13
  1526. jne _less_than_8_bytes_left_encrypt
  1527. _multiple_of_16_bytes_encrypt:
1528. mov arg8, %r12 # %r12 = aadLen (number of bytes)
  1529. shl $3, %r12
  1530. movd %r12d, %xmm15 # len(A) in %xmm15
1531. shl $3, %arg4 # len(C) in bits (*8)
  1532. MOVQ_R64_XMM %arg4, %xmm1
  1533. pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
  1534. pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
  1535. pxor %xmm15, %xmm8
  1536. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1537. # final GHASH computation
  1538. movdqa SHUF_MASK(%rip), %xmm10
  1539. PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
  1540. mov %arg5, %rax # %rax = *Y0
  1541. movdqu (%rax), %xmm0 # %xmm0 = Y0
  1542. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
  1543. pxor %xmm8, %xmm0
  1544. _return_T_encrypt:
  1545. mov arg9, %r10 # %r10 = authTag
  1546. mov arg10, %r11 # %r11 = auth_tag_len
  1547. cmp $16, %r11
  1548. je _T_16_encrypt
  1549. cmp $12, %r11
  1550. je _T_12_encrypt
  1551. _T_8_encrypt:
  1552. MOVQ_R64_XMM %xmm0, %rax
  1553. mov %rax, (%r10)
  1554. jmp _return_T_done_encrypt
  1555. _T_12_encrypt:
  1556. MOVQ_R64_XMM %xmm0, %rax
  1557. mov %rax, (%r10)
  1558. psrldq $8, %xmm0
  1559. movd %xmm0, %eax
  1560. mov %eax, 8(%r10)
  1561. jmp _return_T_done_encrypt
  1562. _T_16_encrypt:
  1563. movdqu %xmm0, (%r10)
  1564. _return_T_done_encrypt:
  1565. mov %r14, %rsp
  1566. pop %r14
  1567. pop %r13
  1568. pop %r12
  1569. ret
  1570. ENDPROC(aesni_gcm_enc)
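/*
 * The final GHASH input assembled in %xmm15 above is the 128-bit block
 * len(A) || len(C), with both lengths expressed in bits. A C sketch of the
 * same block as two 64-bit halves (the in-register byte ordering is handled
 * by the SHUF_MASK shuffles in the code and is omitted here; names are
 * placeholders):
 *
 *     struct gcm_len_block {
 *             unsigned long long aad_len_bits;    // len(A) = aadLen * 8
 *             unsigned long long text_len_bits;   // len(C) = plaintext_len * 8
 *     };
 *
 *     static struct gcm_len_block make_len_block(unsigned long long aad_len,
 *                                                unsigned long long text_len)
 *     {
 *             struct gcm_len_block b = {
 *                     .aad_len_bits  = aad_len  * 8,
 *                     .text_len_bits = text_len * 8,
 *             };
 *             return b;
 *     }
 */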
  1571. #endif
  1572. .align 4
  1573. _key_expansion_128:
  1574. _key_expansion_256a:
  1575. pshufd $0b11111111, %xmm1, %xmm1
  1576. shufps $0b00010000, %xmm0, %xmm4
  1577. pxor %xmm4, %xmm0
  1578. shufps $0b10001100, %xmm0, %xmm4
  1579. pxor %xmm4, %xmm0
  1580. pxor %xmm1, %xmm0
  1581. movaps %xmm0, (TKEYP)
  1582. add $0x10, TKEYP
  1583. ret
  1584. ENDPROC(_key_expansion_128)
  1585. ENDPROC(_key_expansion_256a)
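/*
 * _key_expansion_128 (shared with _key_expansion_256a) is the standard
 * AESKEYGENASSIST-based derivation: broadcast the SubWord/RotWord result and
 * xor it into the running prefix-xor of the previous round key; the
 * shufps/pxor pairs compute that prefix-xor using %xmm4 == 0. An equivalent
 * intrinsics sketch for the AES-128 case (illustrative only; keygened is the
 * _mm_aeskeygenassist_si128() output for this round's rcon):
 *
 *     #include <immintrin.h>
 *
 *     static __m128i expand_key_128_step(__m128i prev, __m128i keygened)
 *     {
 *             keygened = _mm_shuffle_epi32(keygened, 0xff);        // broadcast dword 3
 *             prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4)); // prefix-xor of the
 *             prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4)); // four 32-bit words
 *             prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
 *             return _mm_xor_si128(prev, keygened);
 *     }
 */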
  1586. .align 4
  1587. _key_expansion_192a:
  1588. pshufd $0b01010101, %xmm1, %xmm1
  1589. shufps $0b00010000, %xmm0, %xmm4
  1590. pxor %xmm4, %xmm0
  1591. shufps $0b10001100, %xmm0, %xmm4
  1592. pxor %xmm4, %xmm0
  1593. pxor %xmm1, %xmm0
  1594. movaps %xmm2, %xmm5
  1595. movaps %xmm2, %xmm6
  1596. pslldq $4, %xmm5
  1597. pshufd $0b11111111, %xmm0, %xmm3
  1598. pxor %xmm3, %xmm2
  1599. pxor %xmm5, %xmm2
  1600. movaps %xmm0, %xmm1
  1601. shufps $0b01000100, %xmm0, %xmm6
  1602. movaps %xmm6, (TKEYP)
  1603. shufps $0b01001110, %xmm2, %xmm1
  1604. movaps %xmm1, 0x10(TKEYP)
  1605. add $0x20, TKEYP
  1606. ret
  1607. ENDPROC(_key_expansion_192a)
  1608. .align 4
  1609. _key_expansion_192b:
  1610. pshufd $0b01010101, %xmm1, %xmm1
  1611. shufps $0b00010000, %xmm0, %xmm4
  1612. pxor %xmm4, %xmm0
  1613. shufps $0b10001100, %xmm0, %xmm4
  1614. pxor %xmm4, %xmm0
  1615. pxor %xmm1, %xmm0
  1616. movaps %xmm2, %xmm5
  1617. pslldq $4, %xmm5
  1618. pshufd $0b11111111, %xmm0, %xmm3
  1619. pxor %xmm3, %xmm2
  1620. pxor %xmm5, %xmm2
  1621. movaps %xmm0, (TKEYP)
  1622. add $0x10, TKEYP
  1623. ret
  1624. ENDPROC(_key_expansion_192b)
  1625. .align 4
  1626. _key_expansion_256b:
  1627. pshufd $0b10101010, %xmm1, %xmm1
  1628. shufps $0b00010000, %xmm2, %xmm4
  1629. pxor %xmm4, %xmm2
  1630. shufps $0b10001100, %xmm2, %xmm4
  1631. pxor %xmm4, %xmm2
  1632. pxor %xmm1, %xmm2
  1633. movaps %xmm2, (TKEYP)
  1634. add $0x10, TKEYP
  1635. ret
  1636. ENDPROC(_key_expansion_256b)
  1637. /*
  1638. * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
  1639. * unsigned int key_len)
  1640. */
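/*
 * The 240(KEYP) and 480(KEYP) offsets used below assume a context layout
 * equivalent to the sketch here (field names are illustrative; the
 * authoritative definition is struct crypto_aes_ctx in the C headers):
 *
 *     struct crypto_aes_ctx {
 *             unsigned int key_enc[60];   // expanded encryption keys, offset 0
 *             unsigned int key_dec[60];   // expanded decryption keys, offset 240
 *             unsigned int key_length;    // 16, 24 or 32, offset 480
 *     };
 */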
  1641. ENTRY(aesni_set_key)
  1642. #ifndef __x86_64__
  1643. pushl KEYP
  1644. movl 8(%esp), KEYP # ctx
  1645. movl 12(%esp), UKEYP # in_key
  1646. movl 16(%esp), %edx # key_len
  1647. #endif
  1648. movups (UKEYP), %xmm0 # user key (first 16 bytes)
  1649. movaps %xmm0, (KEYP)
  1650. lea 0x10(KEYP), TKEYP # key addr
  1651. movl %edx, 480(KEYP)
  1652. pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
  1653. cmp $24, %dl
  1654. jb .Lenc_key128
  1655. je .Lenc_key192
  1656. movups 0x10(UKEYP), %xmm2 # other user key
  1657. movaps %xmm2, (TKEYP)
  1658. add $0x10, TKEYP
  1659. AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
  1660. call _key_expansion_256a
  1661. AESKEYGENASSIST 0x1 %xmm0 %xmm1
  1662. call _key_expansion_256b
  1663. AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
  1664. call _key_expansion_256a
  1665. AESKEYGENASSIST 0x2 %xmm0 %xmm1
  1666. call _key_expansion_256b
  1667. AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
  1668. call _key_expansion_256a
  1669. AESKEYGENASSIST 0x4 %xmm0 %xmm1
  1670. call _key_expansion_256b
  1671. AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
  1672. call _key_expansion_256a
  1673. AESKEYGENASSIST 0x8 %xmm0 %xmm1
  1674. call _key_expansion_256b
  1675. AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
  1676. call _key_expansion_256a
  1677. AESKEYGENASSIST 0x10 %xmm0 %xmm1
  1678. call _key_expansion_256b
  1679. AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
  1680. call _key_expansion_256a
  1681. AESKEYGENASSIST 0x20 %xmm0 %xmm1
  1682. call _key_expansion_256b
  1683. AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
  1684. call _key_expansion_256a
  1685. jmp .Ldec_key
  1686. .Lenc_key192:
  1687. movq 0x10(UKEYP), %xmm2 # other user key
  1688. AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
  1689. call _key_expansion_192a
  1690. AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
  1691. call _key_expansion_192b
  1692. AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
  1693. call _key_expansion_192a
  1694. AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
  1695. call _key_expansion_192b
  1696. AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
  1697. call _key_expansion_192a
  1698. AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
  1699. call _key_expansion_192b
  1700. AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
  1701. call _key_expansion_192a
  1702. AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
  1703. call _key_expansion_192b
  1704. jmp .Ldec_key
  1705. .Lenc_key128:
  1706. AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
  1707. call _key_expansion_128
  1708. AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
  1709. call _key_expansion_128
  1710. AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
  1711. call _key_expansion_128
  1712. AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
  1713. call _key_expansion_128
  1714. AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
  1715. call _key_expansion_128
  1716. AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
  1717. call _key_expansion_128
  1718. AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
  1719. call _key_expansion_128
  1720. AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
  1721. call _key_expansion_128
  1722. AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
  1723. call _key_expansion_128
  1724. AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
  1725. call _key_expansion_128
  1726. .Ldec_key:
  1727. sub $0x10, TKEYP
  1728. movaps (KEYP), %xmm0
  1729. movaps (TKEYP), %xmm1
  1730. movaps %xmm0, 240(TKEYP)
  1731. movaps %xmm1, 240(KEYP)
  1732. add $0x10, KEYP
  1733. lea 240-16(TKEYP), UKEYP
  1734. .align 4
  1735. .Ldec_key_loop:
  1736. movaps (KEYP), %xmm0
  1737. AESIMC %xmm0 %xmm1
  1738. movaps %xmm1, (UKEYP)
  1739. add $0x10, KEYP
  1740. sub $0x10, UKEYP
  1741. cmp TKEYP, KEYP
  1742. jb .Ldec_key_loop
  1743. xor AREG, AREG
  1744. #ifndef __x86_64__
  1745. popl KEYP
  1746. #endif
  1747. ret
  1748. ENDPROC(aesni_set_key)
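/*
 * .Ldec_key above builds the decryption key schedule from the encryption
 * one: the first and last round keys are swapped unchanged, and every key in
 * between is run through AESIMC (InvMixColumns) and stored in reverse order.
 * An equivalent intrinsics sketch (illustrative; nr is the number of rounds):
 *
 *     #include <immintrin.h>
 *
 *     static void make_dec_schedule(__m128i dec[], const __m128i enc[], int nr)
 *     {
 *             dec[0] = enc[nr];
 *             for (int i = 1; i < nr; i++)
 *                     dec[i] = _mm_aesimc_si128(enc[nr - i]);
 *             dec[nr] = enc[0];
 *     }
 */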
  1749. /*
  1750. * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
  1751. */
  1752. ENTRY(aesni_enc)
  1753. #ifndef __x86_64__
  1754. pushl KEYP
  1755. pushl KLEN
  1756. movl 12(%esp), KEYP
  1757. movl 16(%esp), OUTP
  1758. movl 20(%esp), INP
  1759. #endif
  1760. movl 480(KEYP), KLEN # key length
  1761. movups (INP), STATE # input
  1762. call _aesni_enc1
  1763. movups STATE, (OUTP) # output
  1764. #ifndef __x86_64__
  1765. popl KLEN
  1766. popl KEYP
  1767. #endif
  1768. ret
  1769. ENDPROC(aesni_enc)
  1770. /*
  1771. * _aesni_enc1: internal ABI
  1772. * input:
  1773. * KEYP: key struct pointer
1774. * KLEN: key length (in bytes)
  1775. * STATE: initial state (input)
  1776. * output:
1777. * STATE: final state (output)
  1778. * changed:
  1779. * KEY
  1780. * TKEYP (T1)
  1781. */
  1782. .align 4
  1783. _aesni_enc1:
  1784. movaps (KEYP), KEY # key
  1785. mov KEYP, TKEYP
  1786. pxor KEY, STATE # round 0
  1787. add $0x30, TKEYP
  1788. cmp $24, KLEN
  1789. jb .Lenc128
  1790. lea 0x20(TKEYP), TKEYP
  1791. je .Lenc192
  1792. add $0x20, TKEYP
  1793. movaps -0x60(TKEYP), KEY
  1794. AESENC KEY STATE
  1795. movaps -0x50(TKEYP), KEY
  1796. AESENC KEY STATE
  1797. .align 4
  1798. .Lenc192:
  1799. movaps -0x40(TKEYP), KEY
  1800. AESENC KEY STATE
  1801. movaps -0x30(TKEYP), KEY
  1802. AESENC KEY STATE
  1803. .align 4
  1804. .Lenc128:
  1805. movaps -0x20(TKEYP), KEY
  1806. AESENC KEY STATE
  1807. movaps -0x10(TKEYP), KEY
  1808. AESENC KEY STATE
  1809. movaps (TKEYP), KEY
  1810. AESENC KEY STATE
  1811. movaps 0x10(TKEYP), KEY
  1812. AESENC KEY STATE
  1813. movaps 0x20(TKEYP), KEY
  1814. AESENC KEY STATE
  1815. movaps 0x30(TKEYP), KEY
  1816. AESENC KEY STATE
  1817. movaps 0x40(TKEYP), KEY
  1818. AESENC KEY STATE
  1819. movaps 0x50(TKEYP), KEY
  1820. AESENC KEY STATE
  1821. movaps 0x60(TKEYP), KEY
  1822. AESENC KEY STATE
  1823. movaps 0x70(TKEYP), KEY
  1824. AESENCLAST KEY STATE
  1825. ret
  1826. ENDPROC(_aesni_enc1)
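/*
 * _aesni_enc1 is the usual AES-NI round flow: xor with round key 0, then
 * nr-1 AESENC rounds and one AESENCLAST, where nr is 10, 12 or 14 depending
 * on the KLEN branch above. An equivalent intrinsics sketch (illustrative;
 * rk[] holds the nr+1 expanded round keys):
 *
 *     #include <immintrin.h>
 *
 *     static __m128i aes_encrypt_block(__m128i in, const __m128i rk[], int nr)
 *     {
 *             __m128i s = _mm_xor_si128(in, rk[0]);        // round 0 (whitening)
 *
 *             for (int i = 1; i < nr; i++)
 *                     s = _mm_aesenc_si128(s, rk[i]);
 *             return _mm_aesenclast_si128(s, rk[nr]);      // last round
 *     }
 */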
  1827. /*
  1828. * _aesni_enc4: internal ABI
  1829. * input:
  1830. * KEYP: key struct pointer
1831. * KLEN: key length (in bytes)
  1832. * STATE1: initial state (input)
  1833. * STATE2
  1834. * STATE3
  1835. * STATE4
  1836. * output:
1837. * STATE1: final state (output)
  1838. * STATE2
  1839. * STATE3
  1840. * STATE4
  1841. * changed:
  1842. * KEY
  1843. * TKEYP (T1)
  1844. */
  1845. .align 4
  1846. _aesni_enc4:
  1847. movaps (KEYP), KEY # key
  1848. mov KEYP, TKEYP
  1849. pxor KEY, STATE1 # round 0
  1850. pxor KEY, STATE2
  1851. pxor KEY, STATE3
  1852. pxor KEY, STATE4
  1853. add $0x30, TKEYP
  1854. cmp $24, KLEN
  1855. jb .L4enc128
  1856. lea 0x20(TKEYP), TKEYP
  1857. je .L4enc192
  1858. add $0x20, TKEYP
  1859. movaps -0x60(TKEYP), KEY
  1860. AESENC KEY STATE1
  1861. AESENC KEY STATE2
  1862. AESENC KEY STATE3
  1863. AESENC KEY STATE4
  1864. movaps -0x50(TKEYP), KEY
  1865. AESENC KEY STATE1
  1866. AESENC KEY STATE2
  1867. AESENC KEY STATE3
  1868. AESENC KEY STATE4
  1869. #.align 4
  1870. .L4enc192:
  1871. movaps -0x40(TKEYP), KEY
  1872. AESENC KEY STATE1
  1873. AESENC KEY STATE2
  1874. AESENC KEY STATE3
  1875. AESENC KEY STATE4
  1876. movaps -0x30(TKEYP), KEY
  1877. AESENC KEY STATE1
  1878. AESENC KEY STATE2
  1879. AESENC KEY STATE3
  1880. AESENC KEY STATE4
  1881. #.align 4
  1882. .L4enc128:
  1883. movaps -0x20(TKEYP), KEY
  1884. AESENC KEY STATE1
  1885. AESENC KEY STATE2
  1886. AESENC KEY STATE3
  1887. AESENC KEY STATE4
  1888. movaps -0x10(TKEYP), KEY
  1889. AESENC KEY STATE1
  1890. AESENC KEY STATE2
  1891. AESENC KEY STATE3
  1892. AESENC KEY STATE4
  1893. movaps (TKEYP), KEY
  1894. AESENC KEY STATE1
  1895. AESENC KEY STATE2
  1896. AESENC KEY STATE3
  1897. AESENC KEY STATE4
  1898. movaps 0x10(TKEYP), KEY
  1899. AESENC KEY STATE1
  1900. AESENC KEY STATE2
  1901. AESENC KEY STATE3
  1902. AESENC KEY STATE4
  1903. movaps 0x20(TKEYP), KEY
  1904. AESENC KEY STATE1
  1905. AESENC KEY STATE2
  1906. AESENC KEY STATE3
  1907. AESENC KEY STATE4
  1908. movaps 0x30(TKEYP), KEY
  1909. AESENC KEY STATE1
  1910. AESENC KEY STATE2
  1911. AESENC KEY STATE3
  1912. AESENC KEY STATE4
  1913. movaps 0x40(TKEYP), KEY
  1914. AESENC KEY STATE1
  1915. AESENC KEY STATE2
  1916. AESENC KEY STATE3
  1917. AESENC KEY STATE4
  1918. movaps 0x50(TKEYP), KEY
  1919. AESENC KEY STATE1
  1920. AESENC KEY STATE2
  1921. AESENC KEY STATE3
  1922. AESENC KEY STATE4
  1923. movaps 0x60(TKEYP), KEY
  1924. AESENC KEY STATE1
  1925. AESENC KEY STATE2
  1926. AESENC KEY STATE3
  1927. AESENC KEY STATE4
  1928. movaps 0x70(TKEYP), KEY
  1929. AESENCLAST KEY STATE1 # last round
  1930. AESENCLAST KEY STATE2
  1931. AESENCLAST KEY STATE3
  1932. AESENCLAST KEY STATE4
  1933. ret
  1934. ENDPROC(_aesni_enc4)
  1935. /*
  1936. * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
  1937. */
  1938. ENTRY(aesni_dec)
  1939. #ifndef __x86_64__
  1940. pushl KEYP
  1941. pushl KLEN
  1942. movl 12(%esp), KEYP
  1943. movl 16(%esp), OUTP
  1944. movl 20(%esp), INP
  1945. #endif
  1946. mov 480(KEYP), KLEN # key length
  1947. add $240, KEYP
  1948. movups (INP), STATE # input
  1949. call _aesni_dec1
1950. movups STATE, (OUTP) # output
  1951. #ifndef __x86_64__
  1952. popl KLEN
  1953. popl KEYP
  1954. #endif
  1955. ret
  1956. ENDPROC(aesni_dec)
  1957. /*
  1958. * _aesni_dec1: internal ABI
  1959. * input:
  1960. * KEYP: key struct pointer
  1961. * KLEN: key length
  1962. * STATE: initial state (input)
  1963. * output:
1964. * STATE: final state (output)
  1965. * changed:
  1966. * KEY
  1967. * TKEYP (T1)
  1968. */
  1969. .align 4
  1970. _aesni_dec1:
  1971. movaps (KEYP), KEY # key
  1972. mov KEYP, TKEYP
  1973. pxor KEY, STATE # round 0
  1974. add $0x30, TKEYP
  1975. cmp $24, KLEN
  1976. jb .Ldec128
  1977. lea 0x20(TKEYP), TKEYP
  1978. je .Ldec192
  1979. add $0x20, TKEYP
  1980. movaps -0x60(TKEYP), KEY
  1981. AESDEC KEY STATE
  1982. movaps -0x50(TKEYP), KEY
  1983. AESDEC KEY STATE
  1984. .align 4
  1985. .Ldec192:
  1986. movaps -0x40(TKEYP), KEY
  1987. AESDEC KEY STATE
  1988. movaps -0x30(TKEYP), KEY
  1989. AESDEC KEY STATE
  1990. .align 4
  1991. .Ldec128:
  1992. movaps -0x20(TKEYP), KEY
  1993. AESDEC KEY STATE
  1994. movaps -0x10(TKEYP), KEY
  1995. AESDEC KEY STATE
  1996. movaps (TKEYP), KEY
  1997. AESDEC KEY STATE
  1998. movaps 0x10(TKEYP), KEY
  1999. AESDEC KEY STATE
  2000. movaps 0x20(TKEYP), KEY
  2001. AESDEC KEY STATE
  2002. movaps 0x30(TKEYP), KEY
  2003. AESDEC KEY STATE
  2004. movaps 0x40(TKEYP), KEY
  2005. AESDEC KEY STATE
  2006. movaps 0x50(TKEYP), KEY
  2007. AESDEC KEY STATE
  2008. movaps 0x60(TKEYP), KEY
  2009. AESDEC KEY STATE
  2010. movaps 0x70(TKEYP), KEY
  2011. AESDECLAST KEY STATE
  2012. ret
  2013. ENDPROC(_aesni_dec1)
  2014. /*
  2015. * _aesni_dec4: internal ABI
  2016. * input:
  2017. * KEYP: key struct pointer
  2018. * KLEN: key length
  2019. * STATE1: initial state (input)
  2020. * STATE2
  2021. * STATE3
  2022. * STATE4
  2023. * output:
2024. * STATE1: final state (output)
  2025. * STATE2
  2026. * STATE3
  2027. * STATE4
  2028. * changed:
  2029. * KEY
  2030. * TKEYP (T1)
  2031. */
  2032. .align 4
  2033. _aesni_dec4:
  2034. movaps (KEYP), KEY # key
  2035. mov KEYP, TKEYP
  2036. pxor KEY, STATE1 # round 0
  2037. pxor KEY, STATE2
  2038. pxor KEY, STATE3
  2039. pxor KEY, STATE4
  2040. add $0x30, TKEYP
  2041. cmp $24, KLEN
  2042. jb .L4dec128
  2043. lea 0x20(TKEYP), TKEYP
  2044. je .L4dec192
  2045. add $0x20, TKEYP
  2046. movaps -0x60(TKEYP), KEY
  2047. AESDEC KEY STATE1
  2048. AESDEC KEY STATE2
  2049. AESDEC KEY STATE3
  2050. AESDEC KEY STATE4
  2051. movaps -0x50(TKEYP), KEY
  2052. AESDEC KEY STATE1
  2053. AESDEC KEY STATE2
  2054. AESDEC KEY STATE3
  2055. AESDEC KEY STATE4
  2056. .align 4
  2057. .L4dec192:
  2058. movaps -0x40(TKEYP), KEY
  2059. AESDEC KEY STATE1
  2060. AESDEC KEY STATE2
  2061. AESDEC KEY STATE3
  2062. AESDEC KEY STATE4
  2063. movaps -0x30(TKEYP), KEY
  2064. AESDEC KEY STATE1
  2065. AESDEC KEY STATE2
  2066. AESDEC KEY STATE3
  2067. AESDEC KEY STATE4
  2068. .align 4
  2069. .L4dec128:
  2070. movaps -0x20(TKEYP), KEY
  2071. AESDEC KEY STATE1
  2072. AESDEC KEY STATE2
  2073. AESDEC KEY STATE3
  2074. AESDEC KEY STATE4
  2075. movaps -0x10(TKEYP), KEY
  2076. AESDEC KEY STATE1
  2077. AESDEC KEY STATE2
  2078. AESDEC KEY STATE3
  2079. AESDEC KEY STATE4
  2080. movaps (TKEYP), KEY
  2081. AESDEC KEY STATE1
  2082. AESDEC KEY STATE2
  2083. AESDEC KEY STATE3
  2084. AESDEC KEY STATE4
  2085. movaps 0x10(TKEYP), KEY
  2086. AESDEC KEY STATE1
  2087. AESDEC KEY STATE2
  2088. AESDEC KEY STATE3
  2089. AESDEC KEY STATE4
  2090. movaps 0x20(TKEYP), KEY
  2091. AESDEC KEY STATE1
  2092. AESDEC KEY STATE2
  2093. AESDEC KEY STATE3
  2094. AESDEC KEY STATE4
  2095. movaps 0x30(TKEYP), KEY
  2096. AESDEC KEY STATE1
  2097. AESDEC KEY STATE2
  2098. AESDEC KEY STATE3
  2099. AESDEC KEY STATE4
  2100. movaps 0x40(TKEYP), KEY
  2101. AESDEC KEY STATE1
  2102. AESDEC KEY STATE2
  2103. AESDEC KEY STATE3
  2104. AESDEC KEY STATE4
  2105. movaps 0x50(TKEYP), KEY
  2106. AESDEC KEY STATE1
  2107. AESDEC KEY STATE2
  2108. AESDEC KEY STATE3
  2109. AESDEC KEY STATE4
  2110. movaps 0x60(TKEYP), KEY
  2111. AESDEC KEY STATE1
  2112. AESDEC KEY STATE2
  2113. AESDEC KEY STATE3
  2114. AESDEC KEY STATE4
  2115. movaps 0x70(TKEYP), KEY
  2116. AESDECLAST KEY STATE1 # last round
  2117. AESDECLAST KEY STATE2
  2118. AESDECLAST KEY STATE3
  2119. AESDECLAST KEY STATE4
  2120. ret
  2121. ENDPROC(_aesni_dec4)
  2122. /*
2123. * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2124. * size_t len)
  2125. */
  2126. ENTRY(aesni_ecb_enc)
  2127. #ifndef __x86_64__
  2128. pushl LEN
  2129. pushl KEYP
  2130. pushl KLEN
  2131. movl 16(%esp), KEYP
  2132. movl 20(%esp), OUTP
  2133. movl 24(%esp), INP
  2134. movl 28(%esp), LEN
  2135. #endif
  2136. test LEN, LEN # check length
  2137. jz .Lecb_enc_ret
  2138. mov 480(KEYP), KLEN
  2139. cmp $16, LEN
  2140. jb .Lecb_enc_ret
  2141. cmp $64, LEN
  2142. jb .Lecb_enc_loop1
  2143. .align 4
  2144. .Lecb_enc_loop4:
  2145. movups (INP), STATE1
  2146. movups 0x10(INP), STATE2
  2147. movups 0x20(INP), STATE3
  2148. movups 0x30(INP), STATE4
  2149. call _aesni_enc4
  2150. movups STATE1, (OUTP)
  2151. movups STATE2, 0x10(OUTP)
  2152. movups STATE3, 0x20(OUTP)
  2153. movups STATE4, 0x30(OUTP)
  2154. sub $64, LEN
  2155. add $64, INP
  2156. add $64, OUTP
  2157. cmp $64, LEN
  2158. jge .Lecb_enc_loop4
  2159. cmp $16, LEN
  2160. jb .Lecb_enc_ret
  2161. .align 4
  2162. .Lecb_enc_loop1:
  2163. movups (INP), STATE1
  2164. call _aesni_enc1
  2165. movups STATE1, (OUTP)
  2166. sub $16, LEN
  2167. add $16, INP
  2168. add $16, OUTP
  2169. cmp $16, LEN
  2170. jge .Lecb_enc_loop1
  2171. .Lecb_enc_ret:
  2172. #ifndef __x86_64__
  2173. popl KLEN
  2174. popl KEYP
  2175. popl LEN
  2176. #endif
  2177. ret
  2178. ENDPROC(aesni_ecb_enc)
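/*
 * The loop structure above (shared by the other modes below) runs the
 * 4-block helper while at least 64 bytes remain, then falls back to single
 * blocks; any tail shorter than 16 bytes is ignored. A C sketch of the same
 * chunking, assuming hypothetical stand-ins for _aesni_enc4/_aesni_enc1:
 *
 *     void encrypt4(unsigned char *dst, const unsigned char *src);  // 4 blocks
 *     void encrypt1(unsigned char *dst, const unsigned char *src);  // 1 block
 *
 *     static void ecb_walk(unsigned char *dst, const unsigned char *src,
 *                          unsigned long len)
 *     {
 *             while (len >= 64) {
 *                     encrypt4(dst, src);
 *                     src += 64; dst += 64; len -= 64;
 *             }
 *             while (len >= 16) {
 *                     encrypt1(dst, src);
 *                     src += 16; dst += 16; len -= 16;
 *             }
 *     }
 */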
  2179. /*
2180. * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2181. * size_t len);
  2182. */
  2183. ENTRY(aesni_ecb_dec)
  2184. #ifndef __x86_64__
  2185. pushl LEN
  2186. pushl KEYP
  2187. pushl KLEN
  2188. movl 16(%esp), KEYP
  2189. movl 20(%esp), OUTP
  2190. movl 24(%esp), INP
  2191. movl 28(%esp), LEN
  2192. #endif
  2193. test LEN, LEN
  2194. jz .Lecb_dec_ret
  2195. mov 480(KEYP), KLEN
  2196. add $240, KEYP
  2197. cmp $16, LEN
  2198. jb .Lecb_dec_ret
  2199. cmp $64, LEN
  2200. jb .Lecb_dec_loop1
  2201. .align 4
  2202. .Lecb_dec_loop4:
  2203. movups (INP), STATE1
  2204. movups 0x10(INP), STATE2
  2205. movups 0x20(INP), STATE3
  2206. movups 0x30(INP), STATE4
  2207. call _aesni_dec4
  2208. movups STATE1, (OUTP)
  2209. movups STATE2, 0x10(OUTP)
  2210. movups STATE3, 0x20(OUTP)
  2211. movups STATE4, 0x30(OUTP)
  2212. sub $64, LEN
  2213. add $64, INP
  2214. add $64, OUTP
  2215. cmp $64, LEN
  2216. jge .Lecb_dec_loop4
  2217. cmp $16, LEN
  2218. jb .Lecb_dec_ret
  2219. .align 4
  2220. .Lecb_dec_loop1:
  2221. movups (INP), STATE1
  2222. call _aesni_dec1
  2223. movups STATE1, (OUTP)
  2224. sub $16, LEN
  2225. add $16, INP
  2226. add $16, OUTP
  2227. cmp $16, LEN
  2228. jge .Lecb_dec_loop1
  2229. .Lecb_dec_ret:
  2230. #ifndef __x86_64__
  2231. popl KLEN
  2232. popl KEYP
  2233. popl LEN
  2234. #endif
  2235. ret
  2236. ENDPROC(aesni_ecb_dec)
  2237. /*
2238. * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2239. * size_t len, u8 *iv)
  2240. */
  2241. ENTRY(aesni_cbc_enc)
  2242. #ifndef __x86_64__
  2243. pushl IVP
  2244. pushl LEN
  2245. pushl KEYP
  2246. pushl KLEN
  2247. movl 20(%esp), KEYP
  2248. movl 24(%esp), OUTP
  2249. movl 28(%esp), INP
  2250. movl 32(%esp), LEN
  2251. movl 36(%esp), IVP
  2252. #endif
  2253. cmp $16, LEN
  2254. jb .Lcbc_enc_ret
  2255. mov 480(KEYP), KLEN
  2256. movups (IVP), STATE # load iv as initial state
  2257. .align 4
  2258. .Lcbc_enc_loop:
  2259. movups (INP), IN # load input
  2260. pxor IN, STATE
  2261. call _aesni_enc1
  2262. movups STATE, (OUTP) # store output
  2263. sub $16, LEN
  2264. add $16, INP
  2265. add $16, OUTP
  2266. cmp $16, LEN
  2267. jge .Lcbc_enc_loop
  2268. movups STATE, (IVP)
  2269. .Lcbc_enc_ret:
  2270. #ifndef __x86_64__
  2271. popl KLEN
  2272. popl KEYP
  2273. popl LEN
  2274. popl IVP
  2275. #endif
  2276. ret
  2277. ENDPROC(aesni_cbc_enc)
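/*
 * CBC encryption above is inherently serial: each plaintext block is xored
 * with the previous ciphertext block (the IV for the first block) before it
 * is encrypted, and the last ciphertext block is written back through IVP
 * for the next call. A C sketch, assuming a hypothetical encrypt1() stand-in
 * for _aesni_enc1:
 *
 *     #include <string.h>
 *
 *     void encrypt1(unsigned char *dst, const unsigned char *src);
 *
 *     static void cbc_encrypt(unsigned char *dst, const unsigned char *src,
 *                             unsigned long nblocks, unsigned char iv[16])
 *     {
 *             unsigned char buf[16];
 *
 *             for (unsigned long i = 0; i < nblocks; i++) {
 *                     for (int j = 0; j < 16; j++)
 *                             buf[j] = src[16 * i + j] ^ iv[j];
 *                     encrypt1(dst + 16 * i, buf);
 *                     memcpy(iv, dst + 16 * i, 16);   // chain on this ciphertext
 *             }
 *     }
 */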
  2278. /*
2279. * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2280. * size_t len, u8 *iv)
  2281. */
  2282. ENTRY(aesni_cbc_dec)
  2283. #ifndef __x86_64__
  2284. pushl IVP
  2285. pushl LEN
  2286. pushl KEYP
  2287. pushl KLEN
  2288. movl 20(%esp), KEYP
  2289. movl 24(%esp), OUTP
  2290. movl 28(%esp), INP
  2291. movl 32(%esp), LEN
  2292. movl 36(%esp), IVP
  2293. #endif
  2294. cmp $16, LEN
  2295. jb .Lcbc_dec_just_ret
  2296. mov 480(KEYP), KLEN
  2297. add $240, KEYP
  2298. movups (IVP), IV
  2299. cmp $64, LEN
  2300. jb .Lcbc_dec_loop1
  2301. .align 4
  2302. .Lcbc_dec_loop4:
  2303. movups (INP), IN1
  2304. movaps IN1, STATE1
  2305. movups 0x10(INP), IN2
  2306. movaps IN2, STATE2
  2307. #ifdef __x86_64__
  2308. movups 0x20(INP), IN3
  2309. movaps IN3, STATE3
  2310. movups 0x30(INP), IN4
  2311. movaps IN4, STATE4
  2312. #else
  2313. movups 0x20(INP), IN1
  2314. movaps IN1, STATE3
  2315. movups 0x30(INP), IN2
  2316. movaps IN2, STATE4
  2317. #endif
  2318. call _aesni_dec4
  2319. pxor IV, STATE1
  2320. #ifdef __x86_64__
  2321. pxor IN1, STATE2
  2322. pxor IN2, STATE3
  2323. pxor IN3, STATE4
  2324. movaps IN4, IV
  2325. #else
  2326. pxor IN1, STATE4
  2327. movaps IN2, IV
  2328. movups (INP), IN1
  2329. pxor IN1, STATE2
  2330. movups 0x10(INP), IN2
  2331. pxor IN2, STATE3
  2332. #endif
  2333. movups STATE1, (OUTP)
  2334. movups STATE2, 0x10(OUTP)
  2335. movups STATE3, 0x20(OUTP)
  2336. movups STATE4, 0x30(OUTP)
  2337. sub $64, LEN
  2338. add $64, INP
  2339. add $64, OUTP
  2340. cmp $64, LEN
  2341. jge .Lcbc_dec_loop4
  2342. cmp $16, LEN
  2343. jb .Lcbc_dec_ret
  2344. .align 4
  2345. .Lcbc_dec_loop1:
  2346. movups (INP), IN
  2347. movaps IN, STATE
  2348. call _aesni_dec1
  2349. pxor IV, STATE
  2350. movups STATE, (OUTP)
  2351. movaps IN, IV
  2352. sub $16, LEN
  2353. add $16, INP
  2354. add $16, OUTP
  2355. cmp $16, LEN
  2356. jge .Lcbc_dec_loop1
  2357. .Lcbc_dec_ret:
  2358. movups IV, (IVP)
  2359. .Lcbc_dec_just_ret:
  2360. #ifndef __x86_64__
  2361. popl KLEN
  2362. popl KEYP
  2363. popl LEN
  2364. popl IVP
  2365. #endif
  2366. ret
  2367. ENDPROC(aesni_cbc_dec)
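/*
 * Unlike CBC encryption, CBC decryption has no dependency between the
 * block-cipher calls, which is why the loop above can run four AESDEC
 * streams at once: P[i] = D(K, C[i]) xor C[i-1], with C[-1] = IV. A C sketch
 * of the per-block logic, assuming a hypothetical decrypt1() stand-in and
 * non-overlapping dst/src (the asm above also handles in-place operation):
 *
 *     #include <string.h>
 *
 *     void decrypt1(unsigned char *dst, const unsigned char *src);
 *
 *     static void cbc_decrypt(unsigned char *dst, const unsigned char *src,
 *                             unsigned long nblocks, unsigned char iv[16])
 *     {
 *             for (unsigned long i = 0; i < nblocks; i++) {
 *                     const unsigned char *prev = i ? src + 16 * (i - 1) : iv;
 *
 *                     decrypt1(dst + 16 * i, src + 16 * i);
 *                     for (int j = 0; j < 16; j++)
 *                             dst[16 * i + j] ^= prev[j];
 *             }
 *             if (nblocks)
 *                     memcpy(iv, src + 16 * (nblocks - 1), 16);  // IV for next call
 *     }
 */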
  2368. #ifdef __x86_64__
  2369. .align 16
  2370. .Lbswap_mask:
  2371. .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
  2372. /*
  2373. * _aesni_inc_init: internal ABI
  2374. * setup registers used by _aesni_inc
  2375. * input:
  2376. * IV
  2377. * output:
  2378. * CTR: == IV, in little endian
  2379. * TCTR_LOW: == lower qword of CTR
  2380. * INC: == 1, in little endian
  2381. * BSWAP_MASK == endian swapping mask
  2382. */
  2383. .align 4
  2384. _aesni_inc_init:
  2385. movaps .Lbswap_mask, BSWAP_MASK
  2386. movaps IV, CTR
  2387. PSHUFB_XMM BSWAP_MASK CTR
  2388. mov $1, TCTR_LOW
  2389. MOVQ_R64_XMM TCTR_LOW INC
  2390. MOVQ_R64_XMM CTR TCTR_LOW
  2391. ret
  2392. ENDPROC(_aesni_inc_init)
  2393. /*
  2394. * _aesni_inc: internal ABI
  2395. * Increase IV by 1, IV is in big endian
  2396. * input:
  2397. * IV
  2398. * CTR: == IV, in little endian
  2399. * TCTR_LOW: == lower qword of CTR
  2400. * INC: == 1, in little endian
  2401. * BSWAP_MASK == endian swapping mask
  2402. * output:
2403. * IV: incremented by 1
  2404. * changed:
  2405. * CTR: == output IV, in little endian
  2406. * TCTR_LOW: == lower qword of CTR
  2407. */
  2408. .align 4
  2409. _aesni_inc:
  2410. paddq INC, CTR
  2411. add $1, TCTR_LOW
  2412. jnc .Linc_low
  2413. pslldq $8, INC
  2414. paddq INC, CTR
  2415. psrldq $8, INC
  2416. .Linc_low:
  2417. movaps CTR, IV
  2418. PSHUFB_XMM BSWAP_MASK IV
  2419. ret
  2420. ENDPROC(_aesni_inc)
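/*
 * _aesni_inc keeps a little-endian copy of the counter so the common case is
 * a single 64-bit add; only when the low qword wraps does it also bump the
 * high qword, then byte-swaps back into IV. Byte-wise, the net effect is a
 * big-endian increment of the 16-byte counter block:
 *
 *     static void ctr_increment_be(unsigned char ctr[16])
 *     {
 *             for (int i = 15; i >= 0; i--)
 *                     if (++ctr[i] != 0)     // stop once a byte does not wrap
 *                             break;
 *     }
 */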
  2421. /*
2422. * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2423. * size_t len, u8 *iv)
  2424. */
  2425. ENTRY(aesni_ctr_enc)
  2426. cmp $16, LEN
  2427. jb .Lctr_enc_just_ret
  2428. mov 480(KEYP), KLEN
  2429. movups (IVP), IV
  2430. call _aesni_inc_init
  2431. cmp $64, LEN
  2432. jb .Lctr_enc_loop1
  2433. .align 4
  2434. .Lctr_enc_loop4:
  2435. movaps IV, STATE1
  2436. call _aesni_inc
  2437. movups (INP), IN1
  2438. movaps IV, STATE2
  2439. call _aesni_inc
  2440. movups 0x10(INP), IN2
  2441. movaps IV, STATE3
  2442. call _aesni_inc
  2443. movups 0x20(INP), IN3
  2444. movaps IV, STATE4
  2445. call _aesni_inc
  2446. movups 0x30(INP), IN4
  2447. call _aesni_enc4
  2448. pxor IN1, STATE1
  2449. movups STATE1, (OUTP)
  2450. pxor IN2, STATE2
  2451. movups STATE2, 0x10(OUTP)
  2452. pxor IN3, STATE3
  2453. movups STATE3, 0x20(OUTP)
  2454. pxor IN4, STATE4
  2455. movups STATE4, 0x30(OUTP)
  2456. sub $64, LEN
  2457. add $64, INP
  2458. add $64, OUTP
  2459. cmp $64, LEN
  2460. jge .Lctr_enc_loop4
  2461. cmp $16, LEN
  2462. jb .Lctr_enc_ret
  2463. .align 4
  2464. .Lctr_enc_loop1:
  2465. movaps IV, STATE
  2466. call _aesni_inc
  2467. movups (INP), IN
  2468. call _aesni_enc1
  2469. pxor IN, STATE
  2470. movups STATE, (OUTP)
  2471. sub $16, LEN
  2472. add $16, INP
  2473. add $16, OUTP
  2474. cmp $16, LEN
  2475. jge .Lctr_enc_loop1
  2476. .Lctr_enc_ret:
  2477. movups IV, (IVP)
  2478. .Lctr_enc_just_ret:
  2479. ret
  2480. ENDPROC(aesni_ctr_enc)
  2481. /*
  2482. * _aesni_gf128mul_x_ble: internal ABI
  2483. * Multiply in GF(2^128) for XTS IVs
  2484. * input:
  2485. * IV: current IV
  2486. * GF128MUL_MASK == mask with 0x87 and 0x01
  2487. * output:
  2488. * IV: next IV
  2489. * changed:
  2490. * CTR: == temporary value
  2491. */
  2492. #define _aesni_gf128mul_x_ble() \
  2493. pshufd $0x13, IV, CTR; \
  2494. paddq IV, IV; \
  2495. psrad $31, CTR; \
  2496. pand GF128MUL_MASK, CTR; \
  2497. pxor CTR, IV;
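/*
 * The macro above multiplies the little-endian ("ble") tweak by x in
 * GF(2^128): shift the 128-bit value left by one bit and, if a carry comes
 * out of bit 127, fold it back in as 0x87 (the low terms of the XTS
 * reduction polynomial x^128 + x^7 + x^2 + x + 1, matching GF128MUL_MASK).
 * A byte-wise C sketch, with t[0] as the least significant byte:
 *
 *     static void xts_mul_x(unsigned char t[16])
 *     {
 *             unsigned char carry = 0;
 *
 *             for (int i = 0; i < 16; i++) {
 *                     unsigned char msb = t[i] >> 7;
 *
 *                     t[i] = (unsigned char)((t[i] << 1) | carry);
 *                     carry = msb;
 *             }
 *             if (carry)
 *                     t[0] ^= 0x87;
 *     }
 */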
  2498. /*
2499. * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
  2500. * bool enc, u8 *iv)
  2501. */
  2502. ENTRY(aesni_xts_crypt8)
  2503. cmpb $0, %cl
  2504. movl $0, %ecx
  2505. movl $240, %r10d
  2506. leaq _aesni_enc4, %r11
  2507. leaq _aesni_dec4, %rax
  2508. cmovel %r10d, %ecx
  2509. cmoveq %rax, %r11
  2510. movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
  2511. movups (IVP), IV
  2512. mov 480(KEYP), KLEN
  2513. addq %rcx, KEYP
  2514. movdqa IV, STATE1
  2515. movdqu 0x00(INP), INC
  2516. pxor INC, STATE1
  2517. movdqu IV, 0x00(OUTP)
  2518. _aesni_gf128mul_x_ble()
  2519. movdqa IV, STATE2
  2520. movdqu 0x10(INP), INC
  2521. pxor INC, STATE2
  2522. movdqu IV, 0x10(OUTP)
  2523. _aesni_gf128mul_x_ble()
  2524. movdqa IV, STATE3
  2525. movdqu 0x20(INP), INC
  2526. pxor INC, STATE3
  2527. movdqu IV, 0x20(OUTP)
  2528. _aesni_gf128mul_x_ble()
  2529. movdqa IV, STATE4
  2530. movdqu 0x30(INP), INC
  2531. pxor INC, STATE4
  2532. movdqu IV, 0x30(OUTP)
  2533. call *%r11
  2534. movdqu 0x00(OUTP), INC
  2535. pxor INC, STATE1
  2536. movdqu STATE1, 0x00(OUTP)
  2537. _aesni_gf128mul_x_ble()
  2538. movdqa IV, STATE1
  2539. movdqu 0x40(INP), INC
  2540. pxor INC, STATE1
  2541. movdqu IV, 0x40(OUTP)
  2542. movdqu 0x10(OUTP), INC
  2543. pxor INC, STATE2
  2544. movdqu STATE2, 0x10(OUTP)
  2545. _aesni_gf128mul_x_ble()
  2546. movdqa IV, STATE2
  2547. movdqu 0x50(INP), INC
  2548. pxor INC, STATE2
  2549. movdqu IV, 0x50(OUTP)
  2550. movdqu 0x20(OUTP), INC
  2551. pxor INC, STATE3
  2552. movdqu STATE3, 0x20(OUTP)
  2553. _aesni_gf128mul_x_ble()
  2554. movdqa IV, STATE3
  2555. movdqu 0x60(INP), INC
  2556. pxor INC, STATE3
  2557. movdqu IV, 0x60(OUTP)
  2558. movdqu 0x30(OUTP), INC
  2559. pxor INC, STATE4
  2560. movdqu STATE4, 0x30(OUTP)
  2561. _aesni_gf128mul_x_ble()
  2562. movdqa IV, STATE4
  2563. movdqu 0x70(INP), INC
  2564. pxor INC, STATE4
  2565. movdqu IV, 0x70(OUTP)
  2566. _aesni_gf128mul_x_ble()
  2567. movups IV, (IVP)
  2568. call *%r11
  2569. movdqu 0x40(OUTP), INC
  2570. pxor INC, STATE1
  2571. movdqu STATE1, 0x40(OUTP)
  2572. movdqu 0x50(OUTP), INC
  2573. pxor INC, STATE2
  2574. movdqu STATE2, 0x50(OUTP)
  2575. movdqu 0x60(OUTP), INC
  2576. pxor INC, STATE3
  2577. movdqu STATE3, 0x60(OUTP)
  2578. movdqu 0x70(OUTP), INC
  2579. pxor INC, STATE4
  2580. movdqu STATE4, 0x70(OUTP)
  2581. ret
  2582. ENDPROC(aesni_xts_crypt8)
  2583. #endif