aesni-intel_avx-x86_64.S

  1. ########################################################################
  2. # Copyright (c) 2013, Intel Corporation
  3. #
  4. # This software is available to you under a choice of one of two
  5. # licenses. You may choose to be licensed under the terms of the GNU
  6. # General Public License (GPL) Version 2, available from the file
  7. # COPYING in the main directory of this source tree, or the
  8. # OpenIB.org BSD license below:
  9. #
  10. # Redistribution and use in source and binary forms, with or without
  11. # modification, are permitted provided that the following conditions are
  12. # met:
  13. #
  14. # * Redistributions of source code must retain the above copyright
  15. # notice, this list of conditions and the following disclaimer.
  16. #
  17. # * Redistributions in binary form must reproduce the above copyright
  18. # notice, this list of conditions and the following disclaimer in the
  19. # documentation and/or other materials provided with the
  20. # distribution.
  21. #
  22. # * Neither the name of the Intel Corporation nor the names of its
  23. # contributors may be used to endorse or promote products derived from
  24. # this software without specific prior written permission.
  25. #
  26. #
  27. # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
  28. # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  30. # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
  31. # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  32. # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  33. # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
  34. # PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  35. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  36. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  37. # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  38. ########################################################################
  39. ##
  40. ## Authors:
  41. ## Erdinc Ozturk <erdinc.ozturk@intel.com>
  42. ## Vinodh Gopal <vinodh.gopal@intel.com>
  43. ## James Guilford <james.guilford@intel.com>
  44. ## Tim Chen <tim.c.chen@linux.intel.com>
  45. ##
  46. ## References:
## This code was derived and highly optimized from the code described in the paper:
## Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
## on Intel Architecture Processors. August, 2010
## The details of the implementation are explained in:
## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
## on Intel Architecture Processors. October, 2012.
  53. ##
  54. ## Assumptions:
  55. ##
  56. ##
  57. ##
  58. ## iv:
  59. ## 0 1 2 3
  60. ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  61. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  62. ## | Salt (From the SA) |
  63. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  64. ## | Initialization Vector |
  65. ## | (This is the sequence number from IPSec header) |
  66. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  67. ## | 0x1 |
  68. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  69. ##
  70. ##
  71. ##
  72. ## AAD:
  73. ## AAD padded to 128 bits with 0
  74. ## for example, assume AAD is a u32 vector
  75. ##
  76. ## if AAD is 8 bytes:
  77. ## AAD[3] = {A0, A1}#
  78. ## padded AAD in xmm register = {A1 A0 0 0}
  79. ##
  80. ## 0 1 2 3
  81. ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  82. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  83. ## | SPI (A1) |
  84. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  85. ## | 32-bit Sequence Number (A0) |
  86. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  87. ## | 0x0 |
  88. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  89. ##
  90. ## AAD Format with 32-bit Sequence Number
  91. ##
  92. ## if AAD is 12 bytes:
  93. ## AAD[3] = {A0, A1, A2}#
  94. ## padded AAD in xmm register = {A2 A1 A0 0}
  95. ##
  96. ## 0 1 2 3
  97. ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  98. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  99. ## | SPI (A2) |
  100. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  101. ## | 64-bit Extended Sequence Number {A1,A0} |
  102. ## | |
  103. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  104. ## | 0x0 |
  105. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  106. ##
  107. ## AAD Format with 64-bit Extended Sequence Number
  108. ##
  109. ##
  110. ## aadLen:
  111. ## from the definition of the spec, aadLen can only be 8 or 12 bytes.
  112. ## The code additionally supports aadLen of length 16 bytes.
  113. ##
  114. ## TLen:
  115. ## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
  116. ##
  117. ## poly = x^128 + x^127 + x^126 + x^121 + 1
## Throughout the code, one-tab and two-tab indentations are used: one tab is
## for the GHASH part, two tabs are for the AES part.
  120. ##
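##
## Summary of the tag construction implemented below: with H the hash subkey
## and X_1..X_m the 128-bit blocks of (padded AAD || ciphertext || len(A)||len(C)),
##     GHASH_H(X) = X_1*H^m  xor  X_2*H^(m-1)  xor ... xor  X_m*H
## over GF(2^128) reduced by the polynomial above, and the returned tag is
##     T = MSB_TLen( GHASH_H(X) xor E(K, Y0) )
## where Y0 is the pre-counter block passed in through the iv argument.
##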
  121. #include <linux/linkage.h>
  122. #include <asm/inst.h>
  123. # constants in mergeable sections, linker can reorder and merge
  124. .section .rodata.cst16.POLY, "aM", @progbits, 16
  125. .align 16
  126. POLY: .octa 0xC2000000000000000000000000000001
  127. .section .rodata.cst16.POLY2, "aM", @progbits, 16
  128. .align 16
  129. POLY2: .octa 0xC20000000000000000000001C2000000
  130. .section .rodata.cst16.TWOONE, "aM", @progbits, 16
  131. .align 16
  132. TWOONE: .octa 0x00000001000000000000000000000001
  133. .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
  134. .align 16
  135. SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
  136. .section .rodata.cst16.ONE, "aM", @progbits, 16
  137. .align 16
  138. ONE: .octa 0x00000000000000000000000000000001
  139. .section .rodata.cst16.ONEf, "aM", @progbits, 16
  140. .align 16
  141. ONEf: .octa 0x01000000000000000000000000000000
  142. # order of these constants should not change.
  143. # more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
  144. .section .rodata, "a", @progbits
  145. .align 16
  146. SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  147. ALL_F: .octa 0xffffffffffffffffffffffffffffffff
  148. .octa 0x00000000000000000000000000000000
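# the partial-block code below loads 16 bytes at a variable offset into this
# region (SHIFT_MASK+16-r13 and ALL_F-SHIFT_MASK(%r12)), which is why
# SHIFT_MASK, ALL_F and the zero block must stay contiguous and in this order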
  149. .text
  150. ##define the fields of the gcm aes context
  151. #{
  152. # u8 expanded_keys[16*11] store expanded keys
  153. # u8 shifted_hkey_1[16] store HashKey <<1 mod poly here
  154. # u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here
  155. # u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here
  156. # u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here
  157. # u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here
  158. # u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here
  159. # u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here
  160. # u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here
  161. # u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
  162. # u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
  163. # u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
  164. # u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
  165. # u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
  166. # u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
  167. # u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
  168. # u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
  169. #} gcm_ctx#
  170. HashKey = 16*11 # store HashKey <<1 mod poly here
  171. HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here
  172. HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here
  173. HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here
  174. HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here
  175. HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here
  176. HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here
  177. HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here
  178. HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
  179. HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
  180. HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
  181. HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
  182. HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
  183. HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
  184. HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
  185. HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
  186. #define arg1 %rdi
  187. #define arg2 %rsi
  188. #define arg3 %rdx
  189. #define arg4 %rcx
  190. #define arg5 %r8
  191. #define arg6 %r9
  192. #define arg7 STACK_OFFSET+8*1(%r14)
  193. #define arg8 STACK_OFFSET+8*2(%r14)
  194. #define arg9 STACK_OFFSET+8*3(%r14)
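# arg1-arg6 are the six integer register arguments of the x86_64 SysV calling
# convention; arg7-arg9 are passed on the stack and are reached through %r14,
# which is loaded with %rsp right after the four register pushes (hence the
# STACK_OFFSET term).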
  195. i = 0
  196. j = 0
  197. out_order = 0
  198. in_order = 1
  199. DEC = 0
  200. ENC = 1
  201. .macro define_reg r n
  202. reg_\r = %xmm\n
  203. .endm
  204. .macro setreg
  205. .altmacro
  206. define_reg i %i
  207. define_reg j %j
  208. .noaltmacro
  209. .endm
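# define_reg relies on .altmacro so that %i / %j expand to the current numeric
# value of i / j; e.g. after "i = 3" and "setreg", reg_i names %xmm3. This is
# what lets the .rep loops below index xmm registers with a symbolic counter.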
# four registers are pushed onto the stack before %r14 is set up, so the
# stack arguments (arg7-arg9) are found past this offset from %r14
  211. STACK_OFFSET = 8*4
  212. TMP1 = 16*0 # Temporary storage for AAD
  213. TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
  214. TMP3 = 16*2 # Temporary storage for AES State 3
  215. TMP4 = 16*3 # Temporary storage for AES State 4
  216. TMP5 = 16*4 # Temporary storage for AES State 5
  217. TMP6 = 16*5 # Temporary storage for AES State 6
  218. TMP7 = 16*6 # Temporary storage for AES State 7
  219. TMP8 = 16*7 # Temporary storage for AES State 8
  220. VARIABLE_OFFSET = 16*8
  221. ################################
  222. # Utility Macros
  223. ################################
  224. # Encryption of a single block
  225. .macro ENCRYPT_SINGLE_BLOCK XMM0
  226. vpxor (arg1), \XMM0, \XMM0
  227. i = 1
  228. setreg
  229. .rep 9
  230. vaesenc 16*i(arg1), \XMM0, \XMM0
  231. i = (i+1)
  232. setreg
  233. .endr
  234. vaesenclast 16*10(arg1), \XMM0, \XMM0
  235. .endm
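# note: the schedule above is hard-coded for AES-128: the whitening XOR, nine
# vaesenc rounds, then vaesenclast with the round-10 key at 16*10(arg1). The
# same 10-round assumption is made by every macro in this file.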
  236. #ifdef CONFIG_AS_AVX
  237. ###############################################################################
  238. # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
  239. # Input: A and B (128-bits each, bit-reflected)
  240. # Output: C = A*B*x mod poly, (i.e. >>1 )
  241. # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
  242. # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
  243. ###############################################################################
  244. .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
  245. vpshufd $0b01001110, \GH, \T2
  246. vpshufd $0b01001110, \HK, \T3
  247. vpxor \GH , \T2, \T2 # T2 = (a1+a0)
  248. vpxor \HK , \T3, \T3 # T3 = (b1+b0)
  249. vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
  250. vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
  251. vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
  252. vpxor \GH, \T2,\T2
  253. vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
  254. vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
  255. vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
  256. vpxor \T3, \GH, \GH
  257. vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
  258. #first phase of the reduction
  259. vpslld $31, \GH, \T2 # packed right shifting << 31
  260. vpslld $30, \GH, \T3 # packed right shifting shift << 30
  261. vpslld $25, \GH, \T4 # packed right shifting shift << 25
  262. vpxor \T3, \T2, \T2 # xor the shifted versions
  263. vpxor \T4, \T2, \T2
  264. vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
  265. vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
  266. vpxor \T2, \GH, \GH # first phase of the reduction complete
  267. #second phase of the reduction
  268. vpsrld $1,\GH, \T2 # packed left shifting >> 1
  269. vpsrld $2,\GH, \T3 # packed left shifting >> 2
  270. vpsrld $7,\GH, \T4 # packed left shifting >> 7
  271. vpxor \T3, \T2, \T2 # xor the shifted versions
  272. vpxor \T4, \T2, \T2
  273. vpxor \T5, \T2, \T2
  274. vpxor \T2, \GH, \GH
  275. vpxor \T1, \GH, \GH # the result is in GH
  276. .endm
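# In other words, with GH = a1:a0 and HK = b1:b0 (64-bit halves), the 256-bit
# carry-less product is built with one Karatsuba step:
#     a1*b1, a0*b0, and (a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0 = a1*b0 ^ a0*b1
# and the high 128 bits are then folded back by the two-phase shift/XOR
# reduction modulo x^128 + x^127 + x^126 + x^121 + 1, all in the bit-reflected
# representation GCM uses.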
  277. .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
# HashKey_i_k holds the XORed low and high 64-bit halves of HashKey_i (for Karatsuba)
  279. vmovdqa \HK, \T5
  280. vpshufd $0b01001110, \T5, \T1
  281. vpxor \T5, \T1, \T1
  282. vmovdqa \T1, HashKey_k(arg1)
  283. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
  284. vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
  285. vpshufd $0b01001110, \T5, \T1
  286. vpxor \T5, \T1, \T1
  287. vmovdqa \T1, HashKey_2_k(arg1)
  288. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
  289. vmovdqa \T5, HashKey_3(arg1)
  290. vpshufd $0b01001110, \T5, \T1
  291. vpxor \T5, \T1, \T1
  292. vmovdqa \T1, HashKey_3_k(arg1)
  293. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
  294. vmovdqa \T5, HashKey_4(arg1)
  295. vpshufd $0b01001110, \T5, \T1
  296. vpxor \T5, \T1, \T1
  297. vmovdqa \T1, HashKey_4_k(arg1)
  298. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
  299. vmovdqa \T5, HashKey_5(arg1)
  300. vpshufd $0b01001110, \T5, \T1
  301. vpxor \T5, \T1, \T1
  302. vmovdqa \T1, HashKey_5_k(arg1)
  303. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
  304. vmovdqa \T5, HashKey_6(arg1)
  305. vpshufd $0b01001110, \T5, \T1
  306. vpxor \T5, \T1, \T1
  307. vmovdqa \T1, HashKey_6_k(arg1)
  308. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
  309. vmovdqa \T5, HashKey_7(arg1)
  310. vpshufd $0b01001110, \T5, \T1
  311. vpxor \T5, \T1, \T1
  312. vmovdqa \T1, HashKey_7_k(arg1)
  313. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
  314. vmovdqa \T5, HashKey_8(arg1)
  315. vpshufd $0b01001110, \T5, \T1
  316. vpxor \T5, \T1, \T1
  317. vmovdqa \T1, HashKey_8_k(arg1)
  318. .endm
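# The precomputed powers H^1..H^8 (each stored as H^i<<1 mod poly) let eight
# blocks be hashed per main-loop iteration: unrolling GHASH's Horner recursion
# over eight ciphertext blocks gives
#     Y_new = (Y_old ^ C1)*H^8 ^ C2*H^7 ^ ... ^ C7*H^2 ^ C8*H
# so the eight multiplications are independent and can be interleaved with the
# AES rounds. The *_k entries cache (high64 ^ low64) of each power for the
# Karatsuba middle product.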
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg2, arg3, r14 are used as pointers only, not modified
  325. .macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
  326. i = (8-\num_initial_blocks)
  327. setreg
  328. mov arg6, %r10 # r10 = AAD
  329. mov arg7, %r12 # r12 = aadLen
  330. mov %r12, %r11
  331. vpxor reg_i, reg_i, reg_i
  332. _get_AAD_loop\@:
  333. vmovd (%r10), \T1
  334. vpslldq $12, \T1, \T1
  335. vpsrldq $4, reg_i, reg_i
  336. vpxor \T1, reg_i, reg_i
  337. add $4, %r10
  338. sub $4, %r12
  339. jg _get_AAD_loop\@
  340. cmp $16, %r11
  341. je _get_AAD_loop2_done\@
  342. mov $16, %r12
  343. _get_AAD_loop2\@:
  344. vpsrldq $4, reg_i, reg_i
  345. sub $4, %r12
  346. cmp %r11, %r12
  347. jg _get_AAD_loop2\@
  348. _get_AAD_loop2_done\@:
  349. #byte-reflect the AAD data
  350. vpshufb SHUF_MASK(%rip), reg_i, reg_i
  351. # initialize the data pointer offset as zero
  352. xor %r11, %r11
  353. # start AES for num_initial_blocks blocks
  354. mov arg5, %rax # rax = *Y0
  355. vmovdqu (%rax), \CTR # CTR = Y0
  356. vpshufb SHUF_MASK(%rip), \CTR, \CTR
  357. i = (9-\num_initial_blocks)
  358. setreg
  359. .rep \num_initial_blocks
  360. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  361. vmovdqa \CTR, reg_i
  362. vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
  363. i = (i+1)
  364. setreg
  365. .endr
  366. vmovdqa (arg1), \T_key
  367. i = (9-\num_initial_blocks)
  368. setreg
  369. .rep \num_initial_blocks
  370. vpxor \T_key, reg_i, reg_i
  371. i = (i+1)
  372. setreg
  373. .endr
  374. j = 1
  375. setreg
  376. .rep 9
  377. vmovdqa 16*j(arg1), \T_key
  378. i = (9-\num_initial_blocks)
  379. setreg
  380. .rep \num_initial_blocks
  381. vaesenc \T_key, reg_i, reg_i
  382. i = (i+1)
  383. setreg
  384. .endr
  385. j = (j+1)
  386. setreg
  387. .endr
  388. vmovdqa 16*10(arg1), \T_key
  389. i = (9-\num_initial_blocks)
  390. setreg
  391. .rep \num_initial_blocks
  392. vaesenclast \T_key, reg_i, reg_i
  393. i = (i+1)
  394. setreg
  395. .endr
  396. i = (9-\num_initial_blocks)
  397. setreg
  398. .rep \num_initial_blocks
  399. vmovdqu (arg3, %r11), \T1
  400. vpxor \T1, reg_i, reg_i
  401. vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks
  402. add $16, %r11
  403. .if \ENC_DEC == DEC
  404. vmovdqa \T1, reg_i
  405. .endif
  406. vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
  407. i = (i+1)
  408. setreg
  409. .endr
  410. i = (8-\num_initial_blocks)
  411. j = (9-\num_initial_blocks)
  412. setreg
  413. GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
  414. .rep \num_initial_blocks
  415. vpxor reg_i, reg_j, reg_j
  416. GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
  417. i = (i+1)
  418. j = (j+1)
  419. setreg
  420. .endr
  421. # XMM8 has the combined result here
  422. vmovdqa \XMM8, TMP1(%rsp)
  423. vmovdqa \XMM8, \T3
  424. cmp $128, %r13
  425. jl _initial_blocks_done\@ # no need for precomputed constants
  426. ###############################################################################
# at least 128 more bytes remain: prepare and encrypt eight further counter
# blocks so the main loop always has eight previous ciphertext blocks to GHASH
  428. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  429. vmovdqa \CTR, \XMM1
  430. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  431. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  432. vmovdqa \CTR, \XMM2
  433. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  434. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  435. vmovdqa \CTR, \XMM3
  436. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  437. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  438. vmovdqa \CTR, \XMM4
  439. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  440. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  441. vmovdqa \CTR, \XMM5
  442. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  443. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  444. vmovdqa \CTR, \XMM6
  445. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  446. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  447. vmovdqa \CTR, \XMM7
  448. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  449. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  450. vmovdqa \CTR, \XMM8
  451. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  452. vmovdqa (arg1), \T_key
  453. vpxor \T_key, \XMM1, \XMM1
  454. vpxor \T_key, \XMM2, \XMM2
  455. vpxor \T_key, \XMM3, \XMM3
  456. vpxor \T_key, \XMM4, \XMM4
  457. vpxor \T_key, \XMM5, \XMM5
  458. vpxor \T_key, \XMM6, \XMM6
  459. vpxor \T_key, \XMM7, \XMM7
  460. vpxor \T_key, \XMM8, \XMM8
  461. i = 1
  462. setreg
  463. .rep 9 # do 9 rounds
  464. vmovdqa 16*i(arg1), \T_key
  465. vaesenc \T_key, \XMM1, \XMM1
  466. vaesenc \T_key, \XMM2, \XMM2
  467. vaesenc \T_key, \XMM3, \XMM3
  468. vaesenc \T_key, \XMM4, \XMM4
  469. vaesenc \T_key, \XMM5, \XMM5
  470. vaesenc \T_key, \XMM6, \XMM6
  471. vaesenc \T_key, \XMM7, \XMM7
  472. vaesenc \T_key, \XMM8, \XMM8
  473. i = (i+1)
  474. setreg
  475. .endr
  476. vmovdqa 16*i(arg1), \T_key
  477. vaesenclast \T_key, \XMM1, \XMM1
  478. vaesenclast \T_key, \XMM2, \XMM2
  479. vaesenclast \T_key, \XMM3, \XMM3
  480. vaesenclast \T_key, \XMM4, \XMM4
  481. vaesenclast \T_key, \XMM5, \XMM5
  482. vaesenclast \T_key, \XMM6, \XMM6
  483. vaesenclast \T_key, \XMM7, \XMM7
  484. vaesenclast \T_key, \XMM8, \XMM8
  485. vmovdqu (arg3, %r11), \T1
  486. vpxor \T1, \XMM1, \XMM1
  487. vmovdqu \XMM1, (arg2 , %r11)
  488. .if \ENC_DEC == DEC
  489. vmovdqa \T1, \XMM1
  490. .endif
  491. vmovdqu 16*1(arg3, %r11), \T1
  492. vpxor \T1, \XMM2, \XMM2
  493. vmovdqu \XMM2, 16*1(arg2 , %r11)
  494. .if \ENC_DEC == DEC
  495. vmovdqa \T1, \XMM2
  496. .endif
  497. vmovdqu 16*2(arg3, %r11), \T1
  498. vpxor \T1, \XMM3, \XMM3
  499. vmovdqu \XMM3, 16*2(arg2 , %r11)
  500. .if \ENC_DEC == DEC
  501. vmovdqa \T1, \XMM3
  502. .endif
  503. vmovdqu 16*3(arg3, %r11), \T1
  504. vpxor \T1, \XMM4, \XMM4
  505. vmovdqu \XMM4, 16*3(arg2 , %r11)
  506. .if \ENC_DEC == DEC
  507. vmovdqa \T1, \XMM4
  508. .endif
  509. vmovdqu 16*4(arg3, %r11), \T1
  510. vpxor \T1, \XMM5, \XMM5
  511. vmovdqu \XMM5, 16*4(arg2 , %r11)
  512. .if \ENC_DEC == DEC
  513. vmovdqa \T1, \XMM5
  514. .endif
  515. vmovdqu 16*5(arg3, %r11), \T1
  516. vpxor \T1, \XMM6, \XMM6
  517. vmovdqu \XMM6, 16*5(arg2 , %r11)
  518. .if \ENC_DEC == DEC
  519. vmovdqa \T1, \XMM6
  520. .endif
  521. vmovdqu 16*6(arg3, %r11), \T1
  522. vpxor \T1, \XMM7, \XMM7
  523. vmovdqu \XMM7, 16*6(arg2 , %r11)
  524. .if \ENC_DEC == DEC
  525. vmovdqa \T1, \XMM7
  526. .endif
  527. vmovdqu 16*7(arg3, %r11), \T1
  528. vpxor \T1, \XMM8, \XMM8
  529. vmovdqu \XMM8, 16*7(arg2 , %r11)
  530. .if \ENC_DEC == DEC
  531. vmovdqa \T1, \XMM8
  532. .endif
  533. add $128, %r11
  534. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  535. vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
  536. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  537. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  538. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  539. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  540. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  541. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  542. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  543. ###############################################################################
  544. _initial_blocks_done\@:
  545. .endm
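# Summary of INITIAL_BLOCKS_AVX: 1) read the AAD four bytes at a time into
# reg_i and byte-reflect it, 2) CTR-encrypt the first (blocks mod 8) blocks
# and GHASH their ciphertext into XMM8, and 3) if at least 128 bytes remain,
# encrypt eight more counter blocks (XMM1-XMM8) so the main loop always has
# eight previous ciphertext blocks to hash while it encrypts the next eight.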
  546. # encrypt 8 blocks at a time
  547. # ghash the 8 previously encrypted ciphertext blocks
  548. # arg1, arg2, arg3 are used as pointers only, not modified
  549. # r11 is the data offset value
  550. .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
  551. vmovdqa \XMM1, \T2
  552. vmovdqa \XMM2, TMP2(%rsp)
  553. vmovdqa \XMM3, TMP3(%rsp)
  554. vmovdqa \XMM4, TMP4(%rsp)
  555. vmovdqa \XMM5, TMP5(%rsp)
  556. vmovdqa \XMM6, TMP6(%rsp)
  557. vmovdqa \XMM7, TMP7(%rsp)
  558. vmovdqa \XMM8, TMP8(%rsp)
  559. .if \loop_idx == in_order
  560. vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
  561. vpaddd ONE(%rip), \XMM1, \XMM2
  562. vpaddd ONE(%rip), \XMM2, \XMM3
  563. vpaddd ONE(%rip), \XMM3, \XMM4
  564. vpaddd ONE(%rip), \XMM4, \XMM5
  565. vpaddd ONE(%rip), \XMM5, \XMM6
  566. vpaddd ONE(%rip), \XMM6, \XMM7
  567. vpaddd ONE(%rip), \XMM7, \XMM8
  568. vmovdqa \XMM8, \CTR
  569. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  570. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  571. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  572. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  573. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  574. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  575. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  576. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  577. .else
  578. vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
  579. vpaddd ONEf(%rip), \XMM1, \XMM2
  580. vpaddd ONEf(%rip), \XMM2, \XMM3
  581. vpaddd ONEf(%rip), \XMM3, \XMM4
  582. vpaddd ONEf(%rip), \XMM4, \XMM5
  583. vpaddd ONEf(%rip), \XMM5, \XMM6
  584. vpaddd ONEf(%rip), \XMM6, \XMM7
  585. vpaddd ONEf(%rip), \XMM7, \XMM8
  586. vmovdqa \XMM8, \CTR
  587. .endif
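# in_order: the counter is kept byte-reflected, so a plain +1 (ONE) is always
# correct but each block must be shuffled before AES. out_order: the counter
# already sits in AES byte order and ONEf adds 1 directly to its low-order
# counter byte (the top byte of the register); that is only safe while this
# byte cannot wrap, which the caller guarantees with the cmp $(255-8) test
# before choosing the path.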
  588. #######################################################################
  589. vmovdqu (arg1), \T1
  590. vpxor \T1, \XMM1, \XMM1
  591. vpxor \T1, \XMM2, \XMM2
  592. vpxor \T1, \XMM3, \XMM3
  593. vpxor \T1, \XMM4, \XMM4
  594. vpxor \T1, \XMM5, \XMM5
  595. vpxor \T1, \XMM6, \XMM6
  596. vpxor \T1, \XMM7, \XMM7
  597. vpxor \T1, \XMM8, \XMM8
  598. #######################################################################
  599. vmovdqu 16*1(arg1), \T1
  600. vaesenc \T1, \XMM1, \XMM1
  601. vaesenc \T1, \XMM2, \XMM2
  602. vaesenc \T1, \XMM3, \XMM3
  603. vaesenc \T1, \XMM4, \XMM4
  604. vaesenc \T1, \XMM5, \XMM5
  605. vaesenc \T1, \XMM6, \XMM6
  606. vaesenc \T1, \XMM7, \XMM7
  607. vaesenc \T1, \XMM8, \XMM8
  608. vmovdqu 16*2(arg1), \T1
  609. vaesenc \T1, \XMM1, \XMM1
  610. vaesenc \T1, \XMM2, \XMM2
  611. vaesenc \T1, \XMM3, \XMM3
  612. vaesenc \T1, \XMM4, \XMM4
  613. vaesenc \T1, \XMM5, \XMM5
  614. vaesenc \T1, \XMM6, \XMM6
  615. vaesenc \T1, \XMM7, \XMM7
  616. vaesenc \T1, \XMM8, \XMM8
  617. #######################################################################
  618. vmovdqa HashKey_8(arg1), \T5
  619. vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
  620. vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
  621. vpshufd $0b01001110, \T2, \T6
  622. vpxor \T2, \T6, \T6
  623. vmovdqa HashKey_8_k(arg1), \T5
  624. vpclmulqdq $0x00, \T5, \T6, \T6
  625. vmovdqu 16*3(arg1), \T1
  626. vaesenc \T1, \XMM1, \XMM1
  627. vaesenc \T1, \XMM2, \XMM2
  628. vaesenc \T1, \XMM3, \XMM3
  629. vaesenc \T1, \XMM4, \XMM4
  630. vaesenc \T1, \XMM5, \XMM5
  631. vaesenc \T1, \XMM6, \XMM6
  632. vaesenc \T1, \XMM7, \XMM7
  633. vaesenc \T1, \XMM8, \XMM8
  634. vmovdqa TMP2(%rsp), \T1
  635. vmovdqa HashKey_7(arg1), \T5
  636. vpclmulqdq $0x11, \T5, \T1, \T3
  637. vpxor \T3, \T4, \T4
  638. vpclmulqdq $0x00, \T5, \T1, \T3
  639. vpxor \T3, \T7, \T7
  640. vpshufd $0b01001110, \T1, \T3
  641. vpxor \T1, \T3, \T3
  642. vmovdqa HashKey_7_k(arg1), \T5
  643. vpclmulqdq $0x10, \T5, \T3, \T3
  644. vpxor \T3, \T6, \T6
  645. vmovdqu 16*4(arg1), \T1
  646. vaesenc \T1, \XMM1, \XMM1
  647. vaesenc \T1, \XMM2, \XMM2
  648. vaesenc \T1, \XMM3, \XMM3
  649. vaesenc \T1, \XMM4, \XMM4
  650. vaesenc \T1, \XMM5, \XMM5
  651. vaesenc \T1, \XMM6, \XMM6
  652. vaesenc \T1, \XMM7, \XMM7
  653. vaesenc \T1, \XMM8, \XMM8
  654. #######################################################################
  655. vmovdqa TMP3(%rsp), \T1
  656. vmovdqa HashKey_6(arg1), \T5
  657. vpclmulqdq $0x11, \T5, \T1, \T3
  658. vpxor \T3, \T4, \T4
  659. vpclmulqdq $0x00, \T5, \T1, \T3
  660. vpxor \T3, \T7, \T7
  661. vpshufd $0b01001110, \T1, \T3
  662. vpxor \T1, \T3, \T3
  663. vmovdqa HashKey_6_k(arg1), \T5
  664. vpclmulqdq $0x10, \T5, \T3, \T3
  665. vpxor \T3, \T6, \T6
  666. vmovdqu 16*5(arg1), \T1
  667. vaesenc \T1, \XMM1, \XMM1
  668. vaesenc \T1, \XMM2, \XMM2
  669. vaesenc \T1, \XMM3, \XMM3
  670. vaesenc \T1, \XMM4, \XMM4
  671. vaesenc \T1, \XMM5, \XMM5
  672. vaesenc \T1, \XMM6, \XMM6
  673. vaesenc \T1, \XMM7, \XMM7
  674. vaesenc \T1, \XMM8, \XMM8
  675. vmovdqa TMP4(%rsp), \T1
  676. vmovdqa HashKey_5(arg1), \T5
  677. vpclmulqdq $0x11, \T5, \T1, \T3
  678. vpxor \T3, \T4, \T4
  679. vpclmulqdq $0x00, \T5, \T1, \T3
  680. vpxor \T3, \T7, \T7
  681. vpshufd $0b01001110, \T1, \T3
  682. vpxor \T1, \T3, \T3
  683. vmovdqa HashKey_5_k(arg1), \T5
  684. vpclmulqdq $0x10, \T5, \T3, \T3
  685. vpxor \T3, \T6, \T6
  686. vmovdqu 16*6(arg1), \T1
  687. vaesenc \T1, \XMM1, \XMM1
  688. vaesenc \T1, \XMM2, \XMM2
  689. vaesenc \T1, \XMM3, \XMM3
  690. vaesenc \T1, \XMM4, \XMM4
  691. vaesenc \T1, \XMM5, \XMM5
  692. vaesenc \T1, \XMM6, \XMM6
  693. vaesenc \T1, \XMM7, \XMM7
  694. vaesenc \T1, \XMM8, \XMM8
  695. vmovdqa TMP5(%rsp), \T1
  696. vmovdqa HashKey_4(arg1), \T5
  697. vpclmulqdq $0x11, \T5, \T1, \T3
  698. vpxor \T3, \T4, \T4
  699. vpclmulqdq $0x00, \T5, \T1, \T3
  700. vpxor \T3, \T7, \T7
  701. vpshufd $0b01001110, \T1, \T3
  702. vpxor \T1, \T3, \T3
  703. vmovdqa HashKey_4_k(arg1), \T5
  704. vpclmulqdq $0x10, \T5, \T3, \T3
  705. vpxor \T3, \T6, \T6
  706. vmovdqu 16*7(arg1), \T1
  707. vaesenc \T1, \XMM1, \XMM1
  708. vaesenc \T1, \XMM2, \XMM2
  709. vaesenc \T1, \XMM3, \XMM3
  710. vaesenc \T1, \XMM4, \XMM4
  711. vaesenc \T1, \XMM5, \XMM5
  712. vaesenc \T1, \XMM6, \XMM6
  713. vaesenc \T1, \XMM7, \XMM7
  714. vaesenc \T1, \XMM8, \XMM8
  715. vmovdqa TMP6(%rsp), \T1
  716. vmovdqa HashKey_3(arg1), \T5
  717. vpclmulqdq $0x11, \T5, \T1, \T3
  718. vpxor \T3, \T4, \T4
  719. vpclmulqdq $0x00, \T5, \T1, \T3
  720. vpxor \T3, \T7, \T7
  721. vpshufd $0b01001110, \T1, \T3
  722. vpxor \T1, \T3, \T3
  723. vmovdqa HashKey_3_k(arg1), \T5
  724. vpclmulqdq $0x10, \T5, \T3, \T3
  725. vpxor \T3, \T6, \T6
  726. vmovdqu 16*8(arg1), \T1
  727. vaesenc \T1, \XMM1, \XMM1
  728. vaesenc \T1, \XMM2, \XMM2
  729. vaesenc \T1, \XMM3, \XMM3
  730. vaesenc \T1, \XMM4, \XMM4
  731. vaesenc \T1, \XMM5, \XMM5
  732. vaesenc \T1, \XMM6, \XMM6
  733. vaesenc \T1, \XMM7, \XMM7
  734. vaesenc \T1, \XMM8, \XMM8
  735. vmovdqa TMP7(%rsp), \T1
  736. vmovdqa HashKey_2(arg1), \T5
  737. vpclmulqdq $0x11, \T5, \T1, \T3
  738. vpxor \T3, \T4, \T4
  739. vpclmulqdq $0x00, \T5, \T1, \T3
  740. vpxor \T3, \T7, \T7
  741. vpshufd $0b01001110, \T1, \T3
  742. vpxor \T1, \T3, \T3
  743. vmovdqa HashKey_2_k(arg1), \T5
  744. vpclmulqdq $0x10, \T5, \T3, \T3
  745. vpxor \T3, \T6, \T6
  746. #######################################################################
  747. vmovdqu 16*9(arg1), \T5
  748. vaesenc \T5, \XMM1, \XMM1
  749. vaesenc \T5, \XMM2, \XMM2
  750. vaesenc \T5, \XMM3, \XMM3
  751. vaesenc \T5, \XMM4, \XMM4
  752. vaesenc \T5, \XMM5, \XMM5
  753. vaesenc \T5, \XMM6, \XMM6
  754. vaesenc \T5, \XMM7, \XMM7
  755. vaesenc \T5, \XMM8, \XMM8
  756. vmovdqa TMP8(%rsp), \T1
  757. vmovdqa HashKey(arg1), \T5
  758. vpclmulqdq $0x11, \T5, \T1, \T3
  759. vpxor \T3, \T4, \T4
  760. vpclmulqdq $0x00, \T5, \T1, \T3
  761. vpxor \T3, \T7, \T7
  762. vpshufd $0b01001110, \T1, \T3
  763. vpxor \T1, \T3, \T3
  764. vmovdqa HashKey_k(arg1), \T5
  765. vpclmulqdq $0x10, \T5, \T3, \T3
  766. vpxor \T3, \T6, \T6
  767. vpxor \T4, \T6, \T6
  768. vpxor \T7, \T6, \T6
  769. vmovdqu 16*10(arg1), \T5
  770. i = 0
  771. j = 1
  772. setreg
  773. .rep 8
  774. vpxor 16*i(arg3, %r11), \T5, \T2
  775. .if \ENC_DEC == ENC
  776. vaesenclast \T2, reg_j, reg_j
  777. .else
  778. vaesenclast \T2, reg_j, \T3
  779. vmovdqu 16*i(arg3, %r11), reg_j
  780. vmovdqu \T3, 16*i(arg2, %r11)
  781. .endif
  782. i = (i+1)
  783. j = (j+1)
  784. setreg
  785. .endr
  786. #######################################################################
  787. vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
  789. vpxor \T3, \T7, \T7
  790. vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
  791. #######################################################################
  792. #first phase of the reduction
  793. #######################################################################
  794. vpslld $31, \T7, \T2 # packed right shifting << 31
  795. vpslld $30, \T7, \T3 # packed right shifting shift << 30
  796. vpslld $25, \T7, \T4 # packed right shifting shift << 25
  797. vpxor \T3, \T2, \T2 # xor the shifted versions
  798. vpxor \T4, \T2, \T2
  799. vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
  800. vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
  801. vpxor \T2, \T7, \T7 # first phase of the reduction complete
  802. #######################################################################
  803. .if \ENC_DEC == ENC
  804. vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
  805. vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
  806. vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
  807. vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
  808. vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
  809. vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
  810. vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
  811. vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
  812. .endif
  813. #######################################################################
  814. #second phase of the reduction
  815. vpsrld $1, \T7, \T2 # packed left shifting >> 1
  816. vpsrld $2, \T7, \T3 # packed left shifting >> 2
  817. vpsrld $7, \T7, \T4 # packed left shifting >> 7
  818. vpxor \T3, \T2, \T2 # xor the shifted versions
  819. vpxor \T4, \T2, \T2
  820. vpxor \T1, \T2, \T2
  821. vpxor \T2, \T7, \T7
  822. vpxor \T7, \T6, \T6 # the result is in T6
  823. #######################################################################
  824. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  825. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  826. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  827. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  828. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  829. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  830. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  831. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  832. vpxor \T6, \XMM1, \XMM1
  833. .endm
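# Summary: each call encrypts eight new counter blocks while GHASHing the
# eight ciphertext blocks produced by the previous call (XMM1 is copied to T2,
# XMM2-XMM8 are spilled to TMP2-TMP8), interleaving the vpclmulqdq work with
# the AES rounds for throughput. The reduced GHASH value is XORed into XMM1
# so it travels with the next iteration's first ciphertext block.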
# GHASH the last 8 ciphertext blocks.
  835. .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
  836. ## Karatsuba Method
  837. vpshufd $0b01001110, \XMM1, \T2
  838. vpxor \XMM1, \T2, \T2
  839. vmovdqa HashKey_8(arg1), \T5
  840. vpclmulqdq $0x11, \T5, \XMM1, \T6
  841. vpclmulqdq $0x00, \T5, \XMM1, \T7
  842. vmovdqa HashKey_8_k(arg1), \T3
  843. vpclmulqdq $0x00, \T3, \T2, \XMM1
  844. ######################
  845. vpshufd $0b01001110, \XMM2, \T2
  846. vpxor \XMM2, \T2, \T2
  847. vmovdqa HashKey_7(arg1), \T5
  848. vpclmulqdq $0x11, \T5, \XMM2, \T4
  849. vpxor \T4, \T6, \T6
  850. vpclmulqdq $0x00, \T5, \XMM2, \T4
  851. vpxor \T4, \T7, \T7
  852. vmovdqa HashKey_7_k(arg1), \T3
  853. vpclmulqdq $0x00, \T3, \T2, \T2
  854. vpxor \T2, \XMM1, \XMM1
  855. ######################
  856. vpshufd $0b01001110, \XMM3, \T2
  857. vpxor \XMM3, \T2, \T2
  858. vmovdqa HashKey_6(arg1), \T5
  859. vpclmulqdq $0x11, \T5, \XMM3, \T4
  860. vpxor \T4, \T6, \T6
  861. vpclmulqdq $0x00, \T5, \XMM3, \T4
  862. vpxor \T4, \T7, \T7
  863. vmovdqa HashKey_6_k(arg1), \T3
  864. vpclmulqdq $0x00, \T3, \T2, \T2
  865. vpxor \T2, \XMM1, \XMM1
  866. ######################
  867. vpshufd $0b01001110, \XMM4, \T2
  868. vpxor \XMM4, \T2, \T2
  869. vmovdqa HashKey_5(arg1), \T5
  870. vpclmulqdq $0x11, \T5, \XMM4, \T4
  871. vpxor \T4, \T6, \T6
  872. vpclmulqdq $0x00, \T5, \XMM4, \T4
  873. vpxor \T4, \T7, \T7
  874. vmovdqa HashKey_5_k(arg1), \T3
  875. vpclmulqdq $0x00, \T3, \T2, \T2
  876. vpxor \T2, \XMM1, \XMM1
  877. ######################
  878. vpshufd $0b01001110, \XMM5, \T2
  879. vpxor \XMM5, \T2, \T2
  880. vmovdqa HashKey_4(arg1), \T5
  881. vpclmulqdq $0x11, \T5, \XMM5, \T4
  882. vpxor \T4, \T6, \T6
  883. vpclmulqdq $0x00, \T5, \XMM5, \T4
  884. vpxor \T4, \T7, \T7
  885. vmovdqa HashKey_4_k(arg1), \T3
  886. vpclmulqdq $0x00, \T3, \T2, \T2
  887. vpxor \T2, \XMM1, \XMM1
  888. ######################
  889. vpshufd $0b01001110, \XMM6, \T2
  890. vpxor \XMM6, \T2, \T2
  891. vmovdqa HashKey_3(arg1), \T5
  892. vpclmulqdq $0x11, \T5, \XMM6, \T4
  893. vpxor \T4, \T6, \T6
  894. vpclmulqdq $0x00, \T5, \XMM6, \T4
  895. vpxor \T4, \T7, \T7
  896. vmovdqa HashKey_3_k(arg1), \T3
  897. vpclmulqdq $0x00, \T3, \T2, \T2
  898. vpxor \T2, \XMM1, \XMM1
  899. ######################
  900. vpshufd $0b01001110, \XMM7, \T2
  901. vpxor \XMM7, \T2, \T2
  902. vmovdqa HashKey_2(arg1), \T5
  903. vpclmulqdq $0x11, \T5, \XMM7, \T4
  904. vpxor \T4, \T6, \T6
  905. vpclmulqdq $0x00, \T5, \XMM7, \T4
  906. vpxor \T4, \T7, \T7
  907. vmovdqa HashKey_2_k(arg1), \T3
  908. vpclmulqdq $0x00, \T3, \T2, \T2
  909. vpxor \T2, \XMM1, \XMM1
  910. ######################
  911. vpshufd $0b01001110, \XMM8, \T2
  912. vpxor \XMM8, \T2, \T2
  913. vmovdqa HashKey(arg1), \T5
  914. vpclmulqdq $0x11, \T5, \XMM8, \T4
  915. vpxor \T4, \T6, \T6
  916. vpclmulqdq $0x00, \T5, \XMM8, \T4
  917. vpxor \T4, \T7, \T7
  918. vmovdqa HashKey_k(arg1), \T3
  919. vpclmulqdq $0x00, \T3, \T2, \T2
  920. vpxor \T2, \XMM1, \XMM1
  921. vpxor \T6, \XMM1, \XMM1
  922. vpxor \T7, \XMM1, \T2
  923. vpslldq $8, \T2, \T4
  924. vpsrldq $8, \T2, \T2
  925. vpxor \T4, \T7, \T7
  926. vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
  927. # the accumulated carry-less multiplications
  928. #######################################################################
  929. #first phase of the reduction
  930. vpslld $31, \T7, \T2 # packed right shifting << 31
  931. vpslld $30, \T7, \T3 # packed right shifting shift << 30
  932. vpslld $25, \T7, \T4 # packed right shifting shift << 25
  933. vpxor \T3, \T2, \T2 # xor the shifted versions
  934. vpxor \T4, \T2, \T2
  935. vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
  936. vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
  937. vpxor \T2, \T7, \T7 # first phase of the reduction complete
  938. #######################################################################
  939. #second phase of the reduction
  940. vpsrld $1, \T7, \T2 # packed left shifting >> 1
  941. vpsrld $2, \T7, \T3 # packed left shifting >> 2
  942. vpsrld $7, \T7, \T4 # packed left shifting >> 7
  943. vpxor \T3, \T2, \T2 # xor the shifted versions
  944. vpxor \T4, \T2, \T2
  945. vpxor \T1, \T2, \T2
  946. vpxor \T2, \T7, \T7
  947. vpxor \T7, \T6, \T6 # the result is in T6
  948. .endm
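# Summary: hash the final eight ciphertext blocks (XMM1-XMM8, already
# byte-reflected) against H^8..H^1, accumulate the Karatsuba partial products
# in T6/T7/XMM1 and run the usual two-phase reduction; the running GHASH
# value is returned in T6 (%xmm14 at the call site).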
  949. # combined for GCM encrypt and decrypt functions
  950. # clobbering all xmm registers
  951. # clobbering r10, r11, r12, r13, r14, r15
  952. .macro GCM_ENC_DEC_AVX ENC_DEC
  953. #the number of pushes must equal STACK_OFFSET
  954. push %r12
  955. push %r13
  956. push %r14
  957. push %r15
  958. mov %rsp, %r14
  959. sub $VARIABLE_OFFSET, %rsp
  960. and $~63, %rsp # align rsp to 64 bytes
  961. vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
  962. mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
  963. and $-16, %r13 # r13 = r13 - (r13 mod 16)
  964. mov %r13, %r12
  965. shr $4, %r12
  966. and $7, %r12
  967. jz _initial_num_blocks_is_0\@
  968. cmp $7, %r12
  969. je _initial_num_blocks_is_7\@
  970. cmp $6, %r12
  971. je _initial_num_blocks_is_6\@
  972. cmp $5, %r12
  973. je _initial_num_blocks_is_5\@
  974. cmp $4, %r12
  975. je _initial_num_blocks_is_4\@
  976. cmp $3, %r12
  977. je _initial_num_blocks_is_3\@
  978. cmp $2, %r12
  979. je _initial_num_blocks_is_2\@
  980. jmp _initial_num_blocks_is_1\@
  981. _initial_num_blocks_is_7\@:
  982. INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  983. sub $16*7, %r13
  984. jmp _initial_blocks_encrypted\@
  985. _initial_num_blocks_is_6\@:
  986. INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  987. sub $16*6, %r13
  988. jmp _initial_blocks_encrypted\@
  989. _initial_num_blocks_is_5\@:
  990. INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  991. sub $16*5, %r13
  992. jmp _initial_blocks_encrypted\@
  993. _initial_num_blocks_is_4\@:
  994. INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  995. sub $16*4, %r13
  996. jmp _initial_blocks_encrypted\@
  997. _initial_num_blocks_is_3\@:
  998. INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  999. sub $16*3, %r13
  1000. jmp _initial_blocks_encrypted\@
  1001. _initial_num_blocks_is_2\@:
  1002. INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1003. sub $16*2, %r13
  1004. jmp _initial_blocks_encrypted\@
  1005. _initial_num_blocks_is_1\@:
  1006. INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1007. sub $16*1, %r13
  1008. jmp _initial_blocks_encrypted\@
  1009. _initial_num_blocks_is_0\@:
  1010. INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1011. _initial_blocks_encrypted\@:
  1012. cmp $0, %r13
  1013. je _zero_cipher_left\@
  1014. sub $128, %r13
  1015. je _eight_cipher_left\@
  1016. vmovd %xmm9, %r15d
  1017. and $255, %r15d
  1018. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
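# %r15d holds the low byte of the counter; while it stays at or below 255-8
# the cheap ONEf increments (out_order) are safe, otherwise one iteration is
# run in in_order mode so the carry can propagate through the 32-bit counter.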
  1019. _encrypt_by_8_new\@:
  1020. cmp $(255-8), %r15d
  1021. jg _encrypt_by_8\@
  1022. add $8, %r15b
  1023. GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
  1024. add $128, %r11
  1025. sub $128, %r13
  1026. jne _encrypt_by_8_new\@
  1027. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1028. jmp _eight_cipher_left\@
  1029. _encrypt_by_8\@:
  1030. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1031. add $8, %r15b
  1032. GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
  1033. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1034. add $128, %r11
  1035. sub $128, %r13
  1036. jne _encrypt_by_8_new\@
  1037. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1038. _eight_cipher_left\@:
  1039. GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
  1040. _zero_cipher_left\@:
  1041. cmp $16, arg4
  1042. jl _only_less_than_16\@
  1043. mov arg4, %r13
  1044. and $15, %r13 # r13 = (arg4 mod 16)
  1045. je _multiple_of_16_bytes\@
# handle the last <16 Byte block separately
  1047. vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
  1048. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1049. ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
  1050. sub $16, %r11
  1051. add %r13, %r11
  1052. vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
  1053. lea SHIFT_MASK+16(%rip), %r12
  1054. sub %r13, %r12 # adjust the shuffle mask pointer to be
  1055. # able to shift 16-r13 bytes (r13 is the
  1056. # number of bytes in plaintext mod 16)
  1057. vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
  1058. vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
  1059. jmp _final_ghash_mul\@
  1060. _only_less_than_16\@:
  1061. # check for 0 length
  1062. mov arg4, %r13
  1063. and $15, %r13 # r13 = (arg4 mod 16)
  1064. je _multiple_of_16_bytes\@
# handle the last <16 Byte block separately
  1066. vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
  1067. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1068. ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
  1069. lea SHIFT_MASK+16(%rip), %r12
  1070. sub %r13, %r12 # adjust the shuffle mask pointer to be
  1071. # able to shift 16-r13 bytes (r13 is the
  1072. # number of bytes in plaintext mod 16)
  1073. _get_last_16_byte_loop\@:
  1074. movb (arg3, %r11), %al
  1075. movb %al, TMP1 (%rsp , %r11)
  1076. add $1, %r11
  1077. cmp %r13, %r11
  1078. jne _get_last_16_byte_loop\@
  1079. vmovdqu TMP1(%rsp), %xmm1
  1080. sub $16, %r11
  1081. _final_ghash_mul\@:
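# at this point the valid bytes of the final partial block occupy the low r13
# bytes of %xmm1; the ALL_F-SHIFT_MASK(%r12) load below yields r13 bytes of
# 0xff followed by zeroes, so everything past the message end is masked off
# before it reaches the GHASH input. GHASH is always computed over ciphertext:
# the %xmm2 copy on decrypt, the freshly produced %xmm9 on encrypt.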
  1082. .if \ENC_DEC == DEC
  1083. vmovdqa %xmm1, %xmm2
  1084. vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
  1085. vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
  1086. # mask out top 16-r13 bytes of xmm9
  1087. vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
  1088. vpand %xmm1, %xmm2, %xmm2
  1089. vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
  1090. vpxor %xmm2, %xmm14, %xmm14
  1091. #GHASH computation for the last <16 Byte block
  1092. GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
  1093. sub %r13, %r11
  1094. add $16, %r11
  1095. .else
  1096. vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
  1097. vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
  1098. # mask out top 16-r13 bytes of xmm9
  1099. vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
  1100. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1101. vpxor %xmm9, %xmm14, %xmm14
  1102. #GHASH computation for the last <16 Byte block
  1103. GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
  1104. sub %r13, %r11
  1105. add $16, %r11
  1106. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
  1107. .endif
  1108. #############################
  1109. # output r13 Bytes
  1110. vmovq %xmm9, %rax
  1111. cmp $8, %r13
  1112. jle _less_than_8_bytes_left\@
  1113. mov %rax, (arg2 , %r11)
  1114. add $8, %r11
  1115. vpsrldq $8, %xmm9, %xmm9
  1116. vmovq %xmm9, %rax
  1117. sub $8, %r13
  1118. _less_than_8_bytes_left\@:
  1119. movb %al, (arg2 , %r11)
  1120. add $1, %r11
  1121. shr $8, %rax
  1122. sub $1, %r13
  1123. jne _less_than_8_bytes_left\@
  1124. #############################
  1125. _multiple_of_16_bytes\@:
  1126. mov arg7, %r12 # r12 = aadLen (number of bytes)
  1127. shl $3, %r12 # convert into number of bits
  1128. vmovd %r12d, %xmm15 # len(A) in xmm15
shl $3, arg4 # len(C) in bits (*8)
  1130. vmovq arg4, %xmm1
  1131. vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
  1132. vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
  1133. vpxor %xmm15, %xmm14, %xmm14
  1134. GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
  1135. vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
  1136. mov arg5, %rax # rax = *Y0
  1137. vmovdqu (%rax), %xmm9 # xmm9 = Y0
  1138. ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
  1139. vpxor %xmm14, %xmm9, %xmm9
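# xmm9 = E(K, Y0) ^ GHASH(H, A, C), the full 128-bit authentication tag;
# _return_T below writes out the first auth_tag_len (16, 12 or 8) bytes.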
  1140. _return_T\@:
  1141. mov arg8, %r10 # r10 = authTag
  1142. mov arg9, %r11 # r11 = auth_tag_len
  1143. cmp $16, %r11
  1144. je _T_16\@
  1145. cmp $12, %r11
  1146. je _T_12\@
  1147. _T_8\@:
  1148. vmovq %xmm9, %rax
  1149. mov %rax, (%r10)
  1150. jmp _return_T_done\@
  1151. _T_12\@:
  1152. vmovq %xmm9, %rax
  1153. mov %rax, (%r10)
  1154. vpsrldq $8, %xmm9, %xmm9
  1155. vmovd %xmm9, %eax
  1156. mov %eax, 8(%r10)
  1157. jmp _return_T_done\@
  1158. _T_16\@:
  1159. vmovdqu %xmm9, (%r10)
  1160. _return_T_done\@:
  1161. mov %r14, %rsp
  1162. pop %r15
  1163. pop %r14
  1164. pop %r13
  1165. pop %r12
  1166. .endm
  1167. #############################################################
  1168. #void aesni_gcm_precomp_avx_gen2
  1169. # (gcm_data *my_ctx_data,
  1170. # u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
  1171. #############################################################
  1172. ENTRY(aesni_gcm_precomp_avx_gen2)
  1173. #the number of pushes must equal STACK_OFFSET
  1174. push %r12
  1175. push %r13
  1176. push %r14
  1177. push %r15
  1178. mov %rsp, %r14
  1179. sub $VARIABLE_OFFSET, %rsp
  1180. and $~63, %rsp # align rsp to 64 bytes
  1181. vmovdqu (arg2), %xmm6 # xmm6 = HashKey
  1182. vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
  1183. ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
  1184. vmovdqa %xmm6, %xmm2
  1185. vpsllq $1, %xmm6, %xmm6
  1186. vpsrlq $63, %xmm2, %xmm2
  1187. vmovdqa %xmm2, %xmm1
  1188. vpslldq $8, %xmm2, %xmm2
  1189. vpsrldq $8, %xmm1, %xmm1
  1190. vpor %xmm2, %xmm6, %xmm6
  1191. #reduction
  1192. vpshufd $0b00100100, %xmm1, %xmm2
  1193. vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
  1194. vpand POLY(%rip), %xmm2, %xmm2
  1195. vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
  1196. #######################################################################
  1197. vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
  1198. PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
  1199. mov %r14, %rsp
  1200. pop %r15
  1201. pop %r14
  1202. pop %r13
  1203. pop %r12
  1204. ret
  1205. ENDPROC(aesni_gcm_precomp_avx_gen2)
  1206. ###############################################################################
  1207. #void aesni_gcm_enc_avx_gen2(
  1208. # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
  1209. # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
  1210. # const u8 *in, /* Plaintext input */
  1211. # u64 plaintext_len, /* Length of data in Bytes for encryption. */
  1212. # u8 *iv, /* Pre-counter block j0: 4 byte salt
  1213. # (from Security Association) concatenated with 8 byte
  1214. # Initialisation Vector (from IPSec ESP Payload)
  1215. # concatenated with 0x00000001. 16-byte aligned pointer. */
  1216. # const u8 *aad, /* Additional Authentication Data (AAD)*/
  1217. # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
  1218. # u8 *auth_tag, /* Authenticated Tag output. */
  1219. # u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
  1220. # Valid values are 16 (most likely), 12 or 8. */
  1221. ###############################################################################
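# Illustrative call sequence (a sketch only; the real glue code lives on
# the C side of the driver and may differ):
#   aesni_gcm_precomp_avx_gen2(my_ctx_data, hash_subkey);
#   aesni_gcm_enc_avx_gen2(my_ctx_data, out, in, plaintext_len, iv,
#                          aad, aad_len, auth_tag, auth_tag_len);
# with auth_tag_len being 16, 12 or 8 as documented above.
###############################################################################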
  1222. ENTRY(aesni_gcm_enc_avx_gen2)
  1223. GCM_ENC_DEC_AVX ENC
  1224. ret
  1225. ENDPROC(aesni_gcm_enc_avx_gen2)
  1226. ###############################################################################
  1227. #void aesni_gcm_dec_avx_gen2(
  1228. # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
  1229. # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
  1230. # const u8 *in, /* Ciphertext input */
1231. # u64 plaintext_len, /* Length of data in Bytes for decryption. */
  1232. # u8 *iv, /* Pre-counter block j0: 4 byte salt
  1233. # (from Security Association) concatenated with 8 byte
  1234. # Initialisation Vector (from IPSec ESP Payload)
  1235. # concatenated with 0x00000001. 16-byte aligned pointer. */
  1236. # const u8 *aad, /* Additional Authentication Data (AAD)*/
  1237. # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
  1238. # u8 *auth_tag, /* Authenticated Tag output. */
  1239. # u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
  1240. # Valid values are 16 (most likely), 12 or 8. */
  1241. ###############################################################################
  1242. ENTRY(aesni_gcm_dec_avx_gen2)
  1243. GCM_ENC_DEC_AVX DEC
  1244. ret
  1245. ENDPROC(aesni_gcm_dec_avx_gen2)
  1246. #endif /* CONFIG_AS_AVX */
  1247. #ifdef CONFIG_AS_AVX2
  1248. ###############################################################################
  1249. # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
  1250. # Input: A and B (128-bits each, bit-reflected)
  1251. # Output: C = A*B*x mod poly, (i.e. >>1 )
  1252. # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
  1253. # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
  1254. ###############################################################################
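# Sketch of the multiply below: with GH = [a1:a0] and HK = [b1:b0] (64-bit
# halves), four VPCLMULQDQs produce hi = a1*b1, lo = a0*b0 and the cross
# terms a1*b0 ^ a0*b1; the cross terms are split and folded into hi:lo,
# and the 256-bit product is then reduced in two phases using the
# precomputed POLY2 constant.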
  1255. .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
  1256. vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
  1257. vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
  1258. vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
  1259. vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
  1260. vpxor \T3, \GH, \GH
  1261. vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
  1262. vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
  1263. vpxor \T3, \T1, \T1
  1264. vpxor \T2, \GH, \GH
  1265. #######################################################################
  1266. #first phase of the reduction
  1267. vmovdqa POLY2(%rip), \T3
  1268. vpclmulqdq $0x01, \GH, \T3, \T2
  1269. vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
  1270. vpxor \T2, \GH, \GH # first phase of the reduction complete
  1271. #######################################################################
  1272. #second phase of the reduction
  1273. vpclmulqdq $0x00, \GH, \T3, \T2
  1274. vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
  1275. vpclmulqdq $0x10, \GH, \T3, \GH
  1276. vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
  1277. vpxor \T2, \GH, \GH # second phase of the reduction complete
  1278. #######################################################################
  1279. vpxor \T1, \GH, \GH # the result is in GH
  1280. .endm
  1281. .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1282. # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
  1283. vmovdqa \HK, \T5
  1284. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
  1285. vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
  1286. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
  1287. vmovdqa \T5, HashKey_3(arg1)
  1288. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
  1289. vmovdqa \T5, HashKey_4(arg1)
  1290. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
  1291. vmovdqa \T5, HashKey_5(arg1)
  1292. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
  1293. vmovdqa \T5, HashKey_6(arg1)
  1294. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
  1295. vmovdqa \T5, HashKey_7(arg1)
  1296. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
  1297. vmovdqa \T5, HashKey_8(arg1)
  1298. .endm
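# Net effect of PRECOMPUTE_AVX2 (as the per-line comments above state):
#   HashKey_i(arg1) = HashKey^i << 1 mod poly, for i = 2..8,
# each power obtained from the previous one by one GHASH_MUL_AVX2 with HK;
# these stored powers drive the 8-block-parallel GHASH further below.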
  1299. ## if a = number of total plaintext bytes
  1300. ## b = floor(a/16)
1301. ## num_initial_blocks = b mod 8#
  1302. ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
  1303. ## r10, r11, r12, rax are clobbered
  1304. ## arg1, arg2, arg3, r14 are used as a pointer only, not modified
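## e.g. a = 200 bytes: b = floor(200/16) = 12, so num_initial_blocks =
## 12 mod 8 = 4; the remaining 8 full blocks go through the 8-way
## parallel loop and the trailing 8 bytes through the partial-block path.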
  1305. .macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
  1306. i = (8-\num_initial_blocks)
  1307. setreg
  1308. mov arg6, %r10 # r10 = AAD
  1309. mov arg7, %r12 # r12 = aadLen
  1310. mov %r12, %r11
  1311. vpxor reg_i, reg_i, reg_i
  1312. _get_AAD_loop\@:
  1313. vmovd (%r10), \T1
  1314. vpslldq $12, \T1, \T1
  1315. vpsrldq $4, reg_i, reg_i
  1316. vpxor \T1, reg_i, reg_i
  1317. add $4, %r10
  1318. sub $4, %r12
  1319. jg _get_AAD_loop\@
  1320. cmp $16, %r11
  1321. je _get_AAD_loop2_done\@
  1322. mov $16, %r12
  1323. _get_AAD_loop2\@:
  1324. vpsrldq $4, reg_i, reg_i
  1325. sub $4, %r12
  1326. cmp %r11, %r12
  1327. jg _get_AAD_loop2\@
  1328. _get_AAD_loop2_done\@:
  1329. #byte-reflect the AAD data
  1330. vpshufb SHUF_MASK(%rip), reg_i, reg_i
  1331. # initialize the data pointer offset as zero
  1332. xor %r11, %r11
  1333. # start AES for num_initial_blocks blocks
  1334. mov arg5, %rax # rax = *Y0
  1335. vmovdqu (%rax), \CTR # CTR = Y0
  1336. vpshufb SHUF_MASK(%rip), \CTR, \CTR
  1337. i = (9-\num_initial_blocks)
  1338. setreg
  1339. .rep \num_initial_blocks
  1340. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1341. vmovdqa \CTR, reg_i
  1342. vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
  1343. i = (i+1)
  1344. setreg
  1345. .endr
  1346. vmovdqa (arg1), \T_key
  1347. i = (9-\num_initial_blocks)
  1348. setreg
  1349. .rep \num_initial_blocks
  1350. vpxor \T_key, reg_i, reg_i
  1351. i = (i+1)
  1352. setreg
  1353. .endr
  1354. j = 1
  1355. setreg
  1356. .rep 9
  1357. vmovdqa 16*j(arg1), \T_key
  1358. i = (9-\num_initial_blocks)
  1359. setreg
  1360. .rep \num_initial_blocks
  1361. vaesenc \T_key, reg_i, reg_i
  1362. i = (i+1)
  1363. setreg
  1364. .endr
  1365. j = (j+1)
  1366. setreg
  1367. .endr
  1368. vmovdqa 16*10(arg1), \T_key
  1369. i = (9-\num_initial_blocks)
  1370. setreg
  1371. .rep \num_initial_blocks
  1372. vaesenclast \T_key, reg_i, reg_i
  1373. i = (i+1)
  1374. setreg
  1375. .endr
  1376. i = (9-\num_initial_blocks)
  1377. setreg
  1378. .rep \num_initial_blocks
  1379. vmovdqu (arg3, %r11), \T1
  1380. vpxor \T1, reg_i, reg_i
  1381. vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for
  1382. # num_initial_blocks blocks
  1383. add $16, %r11
  1384. .if \ENC_DEC == DEC
  1385. vmovdqa \T1, reg_i
  1386. .endif
  1387. vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
  1388. i = (i+1)
  1389. setreg
  1390. .endr
  1391. i = (8-\num_initial_blocks)
  1392. j = (9-\num_initial_blocks)
  1393. setreg
  1394. GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
  1395. .rep \num_initial_blocks
  1396. vpxor reg_i, reg_j, reg_j
  1397. GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
  1398. i = (i+1)
  1399. j = (j+1)
  1400. setreg
  1401. .endr
  1402. # XMM8 has the combined result here
  1403. vmovdqa \XMM8, TMP1(%rsp)
  1404. vmovdqa \XMM8, \T3
  1405. cmp $128, %r13
  1406. jl _initial_blocks_done\@ # no need for precomputed constants
  1407. ###############################################################################
1408. # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
  1409. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1410. vmovdqa \CTR, \XMM1
  1411. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  1412. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1413. vmovdqa \CTR, \XMM2
  1414. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  1415. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1416. vmovdqa \CTR, \XMM3
  1417. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  1418. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1419. vmovdqa \CTR, \XMM4
  1420. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  1421. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1422. vmovdqa \CTR, \XMM5
  1423. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  1424. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1425. vmovdqa \CTR, \XMM6
  1426. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  1427. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1428. vmovdqa \CTR, \XMM7
  1429. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  1430. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1431. vmovdqa \CTR, \XMM8
  1432. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  1433. vmovdqa (arg1), \T_key
  1434. vpxor \T_key, \XMM1, \XMM1
  1435. vpxor \T_key, \XMM2, \XMM2
  1436. vpxor \T_key, \XMM3, \XMM3
  1437. vpxor \T_key, \XMM4, \XMM4
  1438. vpxor \T_key, \XMM5, \XMM5
  1439. vpxor \T_key, \XMM6, \XMM6
  1440. vpxor \T_key, \XMM7, \XMM7
  1441. vpxor \T_key, \XMM8, \XMM8
  1442. i = 1
  1443. setreg
  1444. .rep 9 # do 9 rounds
  1445. vmovdqa 16*i(arg1), \T_key
  1446. vaesenc \T_key, \XMM1, \XMM1
  1447. vaesenc \T_key, \XMM2, \XMM2
  1448. vaesenc \T_key, \XMM3, \XMM3
  1449. vaesenc \T_key, \XMM4, \XMM4
  1450. vaesenc \T_key, \XMM5, \XMM5
  1451. vaesenc \T_key, \XMM6, \XMM6
  1452. vaesenc \T_key, \XMM7, \XMM7
  1453. vaesenc \T_key, \XMM8, \XMM8
  1454. i = (i+1)
  1455. setreg
  1456. .endr
  1457. vmovdqa 16*i(arg1), \T_key
  1458. vaesenclast \T_key, \XMM1, \XMM1
  1459. vaesenclast \T_key, \XMM2, \XMM2
  1460. vaesenclast \T_key, \XMM3, \XMM3
  1461. vaesenclast \T_key, \XMM4, \XMM4
  1462. vaesenclast \T_key, \XMM5, \XMM5
  1463. vaesenclast \T_key, \XMM6, \XMM6
  1464. vaesenclast \T_key, \XMM7, \XMM7
  1465. vaesenclast \T_key, \XMM8, \XMM8
  1466. vmovdqu (arg3, %r11), \T1
  1467. vpxor \T1, \XMM1, \XMM1
  1468. vmovdqu \XMM1, (arg2 , %r11)
  1469. .if \ENC_DEC == DEC
  1470. vmovdqa \T1, \XMM1
  1471. .endif
  1472. vmovdqu 16*1(arg3, %r11), \T1
  1473. vpxor \T1, \XMM2, \XMM2
  1474. vmovdqu \XMM2, 16*1(arg2 , %r11)
  1475. .if \ENC_DEC == DEC
  1476. vmovdqa \T1, \XMM2
  1477. .endif
  1478. vmovdqu 16*2(arg3, %r11), \T1
  1479. vpxor \T1, \XMM3, \XMM3
  1480. vmovdqu \XMM3, 16*2(arg2 , %r11)
  1481. .if \ENC_DEC == DEC
  1482. vmovdqa \T1, \XMM3
  1483. .endif
  1484. vmovdqu 16*3(arg3, %r11), \T1
  1485. vpxor \T1, \XMM4, \XMM4
  1486. vmovdqu \XMM4, 16*3(arg2 , %r11)
  1487. .if \ENC_DEC == DEC
  1488. vmovdqa \T1, \XMM4
  1489. .endif
  1490. vmovdqu 16*4(arg3, %r11), \T1
  1491. vpxor \T1, \XMM5, \XMM5
  1492. vmovdqu \XMM5, 16*4(arg2 , %r11)
  1493. .if \ENC_DEC == DEC
  1494. vmovdqa \T1, \XMM5
  1495. .endif
  1496. vmovdqu 16*5(arg3, %r11), \T1
  1497. vpxor \T1, \XMM6, \XMM6
  1498. vmovdqu \XMM6, 16*5(arg2 , %r11)
  1499. .if \ENC_DEC == DEC
  1500. vmovdqa \T1, \XMM6
  1501. .endif
  1502. vmovdqu 16*6(arg3, %r11), \T1
  1503. vpxor \T1, \XMM7, \XMM7
  1504. vmovdqu \XMM7, 16*6(arg2 , %r11)
  1505. .if \ENC_DEC == DEC
  1506. vmovdqa \T1, \XMM7
  1507. .endif
  1508. vmovdqu 16*7(arg3, %r11), \T1
  1509. vpxor \T1, \XMM8, \XMM8
  1510. vmovdqu \XMM8, 16*7(arg2 , %r11)
  1511. .if \ENC_DEC == DEC
  1512. vmovdqa \T1, \XMM8
  1513. .endif
  1514. add $128, %r11
  1515. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  1516. vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
  1517. # the corresponding ciphertext
  1518. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  1519. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  1520. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  1521. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  1522. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  1523. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  1524. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  1525. ###############################################################################
  1526. _initial_blocks_done\@:
  1527. .endm
  1528. # encrypt 8 blocks at a time
  1529. # ghash the 8 previously encrypted ciphertext blocks
  1530. # arg1, arg2, arg3 are used as pointers only, not modified
  1531. # r11 is the data offset value
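# The AES rounds for the 8 new counter blocks are interleaved with the
# CLMUL/XOR GHASH of the 8 previous ciphertext blocks (saved in T2 and
# TMP2..TMP8), so the two dependency chains overlap.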
  1532. .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
  1533. vmovdqa \XMM1, \T2
  1534. vmovdqa \XMM2, TMP2(%rsp)
  1535. vmovdqa \XMM3, TMP3(%rsp)
  1536. vmovdqa \XMM4, TMP4(%rsp)
  1537. vmovdqa \XMM5, TMP5(%rsp)
  1538. vmovdqa \XMM6, TMP6(%rsp)
  1539. vmovdqa \XMM7, TMP7(%rsp)
  1540. vmovdqa \XMM8, TMP8(%rsp)
  1541. .if \loop_idx == in_order
  1542. vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
  1543. vpaddd ONE(%rip), \XMM1, \XMM2
  1544. vpaddd ONE(%rip), \XMM2, \XMM3
  1545. vpaddd ONE(%rip), \XMM3, \XMM4
  1546. vpaddd ONE(%rip), \XMM4, \XMM5
  1547. vpaddd ONE(%rip), \XMM5, \XMM6
  1548. vpaddd ONE(%rip), \XMM6, \XMM7
  1549. vpaddd ONE(%rip), \XMM7, \XMM8
  1550. vmovdqa \XMM8, \CTR
  1551. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  1552. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  1553. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  1554. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  1555. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  1556. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  1557. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  1558. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  1559. .else
  1560. vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
  1561. vpaddd ONEf(%rip), \XMM1, \XMM2
  1562. vpaddd ONEf(%rip), \XMM2, \XMM3
  1563. vpaddd ONEf(%rip), \XMM3, \XMM4
  1564. vpaddd ONEf(%rip), \XMM4, \XMM5
  1565. vpaddd ONEf(%rip), \XMM5, \XMM6
  1566. vpaddd ONEf(%rip), \XMM6, \XMM7
  1567. vpaddd ONEf(%rip), \XMM7, \XMM8
  1568. vmovdqa \XMM8, \CTR
  1569. .endif
  1570. #######################################################################
  1571. vmovdqu (arg1), \T1
  1572. vpxor \T1, \XMM1, \XMM1
  1573. vpxor \T1, \XMM2, \XMM2
  1574. vpxor \T1, \XMM3, \XMM3
  1575. vpxor \T1, \XMM4, \XMM4
  1576. vpxor \T1, \XMM5, \XMM5
  1577. vpxor \T1, \XMM6, \XMM6
  1578. vpxor \T1, \XMM7, \XMM7
  1579. vpxor \T1, \XMM8, \XMM8
  1580. #######################################################################
  1581. vmovdqu 16*1(arg1), \T1
  1582. vaesenc \T1, \XMM1, \XMM1
  1583. vaesenc \T1, \XMM2, \XMM2
  1584. vaesenc \T1, \XMM3, \XMM3
  1585. vaesenc \T1, \XMM4, \XMM4
  1586. vaesenc \T1, \XMM5, \XMM5
  1587. vaesenc \T1, \XMM6, \XMM6
  1588. vaesenc \T1, \XMM7, \XMM7
  1589. vaesenc \T1, \XMM8, \XMM8
  1590. vmovdqu 16*2(arg1), \T1
  1591. vaesenc \T1, \XMM1, \XMM1
  1592. vaesenc \T1, \XMM2, \XMM2
  1593. vaesenc \T1, \XMM3, \XMM3
  1594. vaesenc \T1, \XMM4, \XMM4
  1595. vaesenc \T1, \XMM5, \XMM5
  1596. vaesenc \T1, \XMM6, \XMM6
  1597. vaesenc \T1, \XMM7, \XMM7
  1598. vaesenc \T1, \XMM8, \XMM8
  1599. #######################################################################
  1600. vmovdqa HashKey_8(arg1), \T5
  1601. vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
  1602. vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
  1603. vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
  1604. vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
  1605. vpxor \T5, \T6, \T6
  1606. vmovdqu 16*3(arg1), \T1
  1607. vaesenc \T1, \XMM1, \XMM1
  1608. vaesenc \T1, \XMM2, \XMM2
  1609. vaesenc \T1, \XMM3, \XMM3
  1610. vaesenc \T1, \XMM4, \XMM4
  1611. vaesenc \T1, \XMM5, \XMM5
  1612. vaesenc \T1, \XMM6, \XMM6
  1613. vaesenc \T1, \XMM7, \XMM7
  1614. vaesenc \T1, \XMM8, \XMM8
  1615. vmovdqa TMP2(%rsp), \T1
  1616. vmovdqa HashKey_7(arg1), \T5
  1617. vpclmulqdq $0x11, \T5, \T1, \T3
  1618. vpxor \T3, \T4, \T4
  1619. vpclmulqdq $0x00, \T5, \T1, \T3
  1620. vpxor \T3, \T7, \T7
  1621. vpclmulqdq $0x01, \T5, \T1, \T3
  1622. vpxor \T3, \T6, \T6
  1623. vpclmulqdq $0x10, \T5, \T1, \T3
  1624. vpxor \T3, \T6, \T6
  1625. vmovdqu 16*4(arg1), \T1
  1626. vaesenc \T1, \XMM1, \XMM1
  1627. vaesenc \T1, \XMM2, \XMM2
  1628. vaesenc \T1, \XMM3, \XMM3
  1629. vaesenc \T1, \XMM4, \XMM4
  1630. vaesenc \T1, \XMM5, \XMM5
  1631. vaesenc \T1, \XMM6, \XMM6
  1632. vaesenc \T1, \XMM7, \XMM7
  1633. vaesenc \T1, \XMM8, \XMM8
  1634. #######################################################################
  1635. vmovdqa TMP3(%rsp), \T1
  1636. vmovdqa HashKey_6(arg1), \T5
  1637. vpclmulqdq $0x11, \T5, \T1, \T3
  1638. vpxor \T3, \T4, \T4
  1639. vpclmulqdq $0x00, \T5, \T1, \T3
  1640. vpxor \T3, \T7, \T7
  1641. vpclmulqdq $0x01, \T5, \T1, \T3
  1642. vpxor \T3, \T6, \T6
  1643. vpclmulqdq $0x10, \T5, \T1, \T3
  1644. vpxor \T3, \T6, \T6
  1645. vmovdqu 16*5(arg1), \T1
  1646. vaesenc \T1, \XMM1, \XMM1
  1647. vaesenc \T1, \XMM2, \XMM2
  1648. vaesenc \T1, \XMM3, \XMM3
  1649. vaesenc \T1, \XMM4, \XMM4
  1650. vaesenc \T1, \XMM5, \XMM5
  1651. vaesenc \T1, \XMM6, \XMM6
  1652. vaesenc \T1, \XMM7, \XMM7
  1653. vaesenc \T1, \XMM8, \XMM8
  1654. vmovdqa TMP4(%rsp), \T1
  1655. vmovdqa HashKey_5(arg1), \T5
  1656. vpclmulqdq $0x11, \T5, \T1, \T3
  1657. vpxor \T3, \T4, \T4
  1658. vpclmulqdq $0x00, \T5, \T1, \T3
  1659. vpxor \T3, \T7, \T7
  1660. vpclmulqdq $0x01, \T5, \T1, \T3
  1661. vpxor \T3, \T6, \T6
  1662. vpclmulqdq $0x10, \T5, \T1, \T3
  1663. vpxor \T3, \T6, \T6
  1664. vmovdqu 16*6(arg1), \T1
  1665. vaesenc \T1, \XMM1, \XMM1
  1666. vaesenc \T1, \XMM2, \XMM2
  1667. vaesenc \T1, \XMM3, \XMM3
  1668. vaesenc \T1, \XMM4, \XMM4
  1669. vaesenc \T1, \XMM5, \XMM5
  1670. vaesenc \T1, \XMM6, \XMM6
  1671. vaesenc \T1, \XMM7, \XMM7
  1672. vaesenc \T1, \XMM8, \XMM8
  1673. vmovdqa TMP5(%rsp), \T1
  1674. vmovdqa HashKey_4(arg1), \T5
  1675. vpclmulqdq $0x11, \T5, \T1, \T3
  1676. vpxor \T3, \T4, \T4
  1677. vpclmulqdq $0x00, \T5, \T1, \T3
  1678. vpxor \T3, \T7, \T7
  1679. vpclmulqdq $0x01, \T5, \T1, \T3
  1680. vpxor \T3, \T6, \T6
  1681. vpclmulqdq $0x10, \T5, \T1, \T3
  1682. vpxor \T3, \T6, \T6
  1683. vmovdqu 16*7(arg1), \T1
  1684. vaesenc \T1, \XMM1, \XMM1
  1685. vaesenc \T1, \XMM2, \XMM2
  1686. vaesenc \T1, \XMM3, \XMM3
  1687. vaesenc \T1, \XMM4, \XMM4
  1688. vaesenc \T1, \XMM5, \XMM5
  1689. vaesenc \T1, \XMM6, \XMM6
  1690. vaesenc \T1, \XMM7, \XMM7
  1691. vaesenc \T1, \XMM8, \XMM8
  1692. vmovdqa TMP6(%rsp), \T1
  1693. vmovdqa HashKey_3(arg1), \T5
  1694. vpclmulqdq $0x11, \T5, \T1, \T3
  1695. vpxor \T3, \T4, \T4
  1696. vpclmulqdq $0x00, \T5, \T1, \T3
  1697. vpxor \T3, \T7, \T7
  1698. vpclmulqdq $0x01, \T5, \T1, \T3
  1699. vpxor \T3, \T6, \T6
  1700. vpclmulqdq $0x10, \T5, \T1, \T3
  1701. vpxor \T3, \T6, \T6
  1702. vmovdqu 16*8(arg1), \T1
  1703. vaesenc \T1, \XMM1, \XMM1
  1704. vaesenc \T1, \XMM2, \XMM2
  1705. vaesenc \T1, \XMM3, \XMM3
  1706. vaesenc \T1, \XMM4, \XMM4
  1707. vaesenc \T1, \XMM5, \XMM5
  1708. vaesenc \T1, \XMM6, \XMM6
  1709. vaesenc \T1, \XMM7, \XMM7
  1710. vaesenc \T1, \XMM8, \XMM8
  1711. vmovdqa TMP7(%rsp), \T1
  1712. vmovdqa HashKey_2(arg1), \T5
  1713. vpclmulqdq $0x11, \T5, \T1, \T3
  1714. vpxor \T3, \T4, \T4
  1715. vpclmulqdq $0x00, \T5, \T1, \T3
  1716. vpxor \T3, \T7, \T7
  1717. vpclmulqdq $0x01, \T5, \T1, \T3
  1718. vpxor \T3, \T6, \T6
  1719. vpclmulqdq $0x10, \T5, \T1, \T3
  1720. vpxor \T3, \T6, \T6
  1721. #######################################################################
  1722. vmovdqu 16*9(arg1), \T5
  1723. vaesenc \T5, \XMM1, \XMM1
  1724. vaesenc \T5, \XMM2, \XMM2
  1725. vaesenc \T5, \XMM3, \XMM3
  1726. vaesenc \T5, \XMM4, \XMM4
  1727. vaesenc \T5, \XMM5, \XMM5
  1728. vaesenc \T5, \XMM6, \XMM6
  1729. vaesenc \T5, \XMM7, \XMM7
  1730. vaesenc \T5, \XMM8, \XMM8
  1731. vmovdqa TMP8(%rsp), \T1
  1732. vmovdqa HashKey(arg1), \T5
  1733. vpclmulqdq $0x00, \T5, \T1, \T3
  1734. vpxor \T3, \T7, \T7
  1735. vpclmulqdq $0x01, \T5, \T1, \T3
  1736. vpxor \T3, \T6, \T6
  1737. vpclmulqdq $0x10, \T5, \T1, \T3
  1738. vpxor \T3, \T6, \T6
  1739. vpclmulqdq $0x11, \T5, \T1, \T3
  1740. vpxor \T3, \T4, \T1
  1741. vmovdqu 16*10(arg1), \T5
  1742. i = 0
  1743. j = 1
  1744. setreg
  1745. .rep 8
  1746. vpxor 16*i(arg3, %r11), \T5, \T2
  1747. .if \ENC_DEC == ENC
  1748. vaesenclast \T2, reg_j, reg_j
  1749. .else
  1750. vaesenclast \T2, reg_j, \T3
  1751. vmovdqu 16*i(arg3, %r11), reg_j
  1752. vmovdqu \T3, 16*i(arg2, %r11)
  1753. .endif
  1754. i = (i+1)
  1755. j = (j+1)
  1756. setreg
  1757. .endr
  1758. #######################################################################
  1759. vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
1760. vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
  1761. vpxor \T3, \T7, \T7
  1762. vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
  1763. #######################################################################
  1764. #first phase of the reduction
  1765. vmovdqa POLY2(%rip), \T3
  1766. vpclmulqdq $0x01, \T7, \T3, \T2
  1767. vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
  1768. vpxor \T2, \T7, \T7 # first phase of the reduction complete
  1769. #######################################################################
  1770. .if \ENC_DEC == ENC
  1771. vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
  1772. vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
  1773. vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
  1774. vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
  1775. vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
  1776. vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
  1777. vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
  1778. vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
  1779. .endif
  1780. #######################################################################
  1781. #second phase of the reduction
  1782. vpclmulqdq $0x00, \T7, \T3, \T2
  1783. vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
  1784. vpclmulqdq $0x10, \T7, \T3, \T4
  1785. vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
  1786. vpxor \T2, \T4, \T4 # second phase of the reduction complete
  1787. #######################################################################
  1788. vpxor \T4, \T1, \T1 # the result is in T1
  1789. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  1790. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  1791. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  1792. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  1793. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  1794. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  1795. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  1796. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  1797. vpxor \T1, \XMM1, \XMM1
  1798. .endm
1799. # GHASH the last 8 ciphertext blocks.
  1800. .macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
  1801. ## Karatsuba Method
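## Per block, with X = [x1:x0] and the matching HashKey power H = [h1:h0],
## only three CLMULs are issued: x1*h1 (into T6), x0*h0 (into T7) and
## (x1^x0)*(h1^h0) (into XMM1); in GF(2) the middle term is then
## (x1^x0)*(h1^h0) ^ x1*h1 ^ x0*h0, recovered by the final XORs with
## T6 and T7 before the reduction.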
  1802. vmovdqa HashKey_8(arg1), \T5
  1803. vpshufd $0b01001110, \XMM1, \T2
  1804. vpshufd $0b01001110, \T5, \T3
  1805. vpxor \XMM1, \T2, \T2
  1806. vpxor \T5, \T3, \T3
  1807. vpclmulqdq $0x11, \T5, \XMM1, \T6
  1808. vpclmulqdq $0x00, \T5, \XMM1, \T7
  1809. vpclmulqdq $0x00, \T3, \T2, \XMM1
  1810. ######################
  1811. vmovdqa HashKey_7(arg1), \T5
  1812. vpshufd $0b01001110, \XMM2, \T2
  1813. vpshufd $0b01001110, \T5, \T3
  1814. vpxor \XMM2, \T2, \T2
  1815. vpxor \T5, \T3, \T3
  1816. vpclmulqdq $0x11, \T5, \XMM2, \T4
  1817. vpxor \T4, \T6, \T6
  1818. vpclmulqdq $0x00, \T5, \XMM2, \T4
  1819. vpxor \T4, \T7, \T7
  1820. vpclmulqdq $0x00, \T3, \T2, \T2
  1821. vpxor \T2, \XMM1, \XMM1
  1822. ######################
  1823. vmovdqa HashKey_6(arg1), \T5
  1824. vpshufd $0b01001110, \XMM3, \T2
  1825. vpshufd $0b01001110, \T5, \T3
  1826. vpxor \XMM3, \T2, \T2
  1827. vpxor \T5, \T3, \T3
  1828. vpclmulqdq $0x11, \T5, \XMM3, \T4
  1829. vpxor \T4, \T6, \T6
  1830. vpclmulqdq $0x00, \T5, \XMM3, \T4
  1831. vpxor \T4, \T7, \T7
  1832. vpclmulqdq $0x00, \T3, \T2, \T2
  1833. vpxor \T2, \XMM1, \XMM1
  1834. ######################
  1835. vmovdqa HashKey_5(arg1), \T5
  1836. vpshufd $0b01001110, \XMM4, \T2
  1837. vpshufd $0b01001110, \T5, \T3
  1838. vpxor \XMM4, \T2, \T2
  1839. vpxor \T5, \T3, \T3
  1840. vpclmulqdq $0x11, \T5, \XMM4, \T4
  1841. vpxor \T4, \T6, \T6
  1842. vpclmulqdq $0x00, \T5, \XMM4, \T4
  1843. vpxor \T4, \T7, \T7
  1844. vpclmulqdq $0x00, \T3, \T2, \T2
  1845. vpxor \T2, \XMM1, \XMM1
  1846. ######################
  1847. vmovdqa HashKey_4(arg1), \T5
  1848. vpshufd $0b01001110, \XMM5, \T2
  1849. vpshufd $0b01001110, \T5, \T3
  1850. vpxor \XMM5, \T2, \T2
  1851. vpxor \T5, \T3, \T3
  1852. vpclmulqdq $0x11, \T5, \XMM5, \T4
  1853. vpxor \T4, \T6, \T6
  1854. vpclmulqdq $0x00, \T5, \XMM5, \T4
  1855. vpxor \T4, \T7, \T7
  1856. vpclmulqdq $0x00, \T3, \T2, \T2
  1857. vpxor \T2, \XMM1, \XMM1
  1858. ######################
  1859. vmovdqa HashKey_3(arg1), \T5
  1860. vpshufd $0b01001110, \XMM6, \T2
  1861. vpshufd $0b01001110, \T5, \T3
  1862. vpxor \XMM6, \T2, \T2
  1863. vpxor \T5, \T3, \T3
  1864. vpclmulqdq $0x11, \T5, \XMM6, \T4
  1865. vpxor \T4, \T6, \T6
  1866. vpclmulqdq $0x00, \T5, \XMM6, \T4
  1867. vpxor \T4, \T7, \T7
  1868. vpclmulqdq $0x00, \T3, \T2, \T2
  1869. vpxor \T2, \XMM1, \XMM1
  1870. ######################
  1871. vmovdqa HashKey_2(arg1), \T5
  1872. vpshufd $0b01001110, \XMM7, \T2
  1873. vpshufd $0b01001110, \T5, \T3
  1874. vpxor \XMM7, \T2, \T2
  1875. vpxor \T5, \T3, \T3
  1876. vpclmulqdq $0x11, \T5, \XMM7, \T4
  1877. vpxor \T4, \T6, \T6
  1878. vpclmulqdq $0x00, \T5, \XMM7, \T4
  1879. vpxor \T4, \T7, \T7
  1880. vpclmulqdq $0x00, \T3, \T2, \T2
  1881. vpxor \T2, \XMM1, \XMM1
  1882. ######################
  1883. vmovdqa HashKey(arg1), \T5
  1884. vpshufd $0b01001110, \XMM8, \T2
  1885. vpshufd $0b01001110, \T5, \T3
  1886. vpxor \XMM8, \T2, \T2
  1887. vpxor \T5, \T3, \T3
  1888. vpclmulqdq $0x11, \T5, \XMM8, \T4
  1889. vpxor \T4, \T6, \T6
  1890. vpclmulqdq $0x00, \T5, \XMM8, \T4
  1891. vpxor \T4, \T7, \T7
  1892. vpclmulqdq $0x00, \T3, \T2, \T2
  1893. vpxor \T2, \XMM1, \XMM1
  1894. vpxor \T6, \XMM1, \XMM1
  1895. vpxor \T7, \XMM1, \T2
  1896. vpslldq $8, \T2, \T4
  1897. vpsrldq $8, \T2, \T2
  1898. vpxor \T4, \T7, \T7
  1899. vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
  1900. # accumulated carry-less multiplications
  1901. #######################################################################
  1902. #first phase of the reduction
  1903. vmovdqa POLY2(%rip), \T3
  1904. vpclmulqdq $0x01, \T7, \T3, \T2
  1905. vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
  1906. vpxor \T2, \T7, \T7 # first phase of the reduction complete
  1907. #######################################################################
  1908. #second phase of the reduction
  1909. vpclmulqdq $0x00, \T7, \T3, \T2
  1910. vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
  1911. vpclmulqdq $0x10, \T7, \T3, \T4
  1912. vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
  1913. vpxor \T2, \T4, \T4 # second phase of the reduction complete
  1914. #######################################################################
  1915. vpxor \T4, \T6, \T6 # the result is in T6
  1916. .endm
  1917. # combined for GCM encrypt and decrypt functions
  1918. # clobbering all xmm registers
  1919. # clobbering r10, r11, r12, r13, r14, r15
  1920. .macro GCM_ENC_DEC_AVX2 ENC_DEC
  1921. #the number of pushes must equal STACK_OFFSET
  1922. push %r12
  1923. push %r13
  1924. push %r14
  1925. push %r15
  1926. mov %rsp, %r14
  1927. sub $VARIABLE_OFFSET, %rsp
  1928. and $~63, %rsp # align rsp to 64 bytes
  1929. vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
  1930. mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
  1931. and $-16, %r13 # r13 = r13 - (r13 mod 16)
  1932. mov %r13, %r12
  1933. shr $4, %r12
  1934. and $7, %r12
  1935. jz _initial_num_blocks_is_0\@
  1936. cmp $7, %r12
  1937. je _initial_num_blocks_is_7\@
  1938. cmp $6, %r12
  1939. je _initial_num_blocks_is_6\@
  1940. cmp $5, %r12
  1941. je _initial_num_blocks_is_5\@
  1942. cmp $4, %r12
  1943. je _initial_num_blocks_is_4\@
  1944. cmp $3, %r12
  1945. je _initial_num_blocks_is_3\@
  1946. cmp $2, %r12
  1947. je _initial_num_blocks_is_2\@
  1948. jmp _initial_num_blocks_is_1\@
  1949. _initial_num_blocks_is_7\@:
  1950. INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1951. sub $16*7, %r13
  1952. jmp _initial_blocks_encrypted\@
  1953. _initial_num_blocks_is_6\@:
  1954. INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1955. sub $16*6, %r13
  1956. jmp _initial_blocks_encrypted\@
  1957. _initial_num_blocks_is_5\@:
  1958. INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1959. sub $16*5, %r13
  1960. jmp _initial_blocks_encrypted\@
  1961. _initial_num_blocks_is_4\@:
  1962. INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1963. sub $16*4, %r13
  1964. jmp _initial_blocks_encrypted\@
  1965. _initial_num_blocks_is_3\@:
  1966. INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1967. sub $16*3, %r13
  1968. jmp _initial_blocks_encrypted\@
  1969. _initial_num_blocks_is_2\@:
  1970. INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1971. sub $16*2, %r13
  1972. jmp _initial_blocks_encrypted\@
  1973. _initial_num_blocks_is_1\@:
  1974. INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1975. sub $16*1, %r13
  1976. jmp _initial_blocks_encrypted\@
  1977. _initial_num_blocks_is_0\@:
  1978. INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1979. _initial_blocks_encrypted\@:
  1980. cmp $0, %r13
  1981. je _zero_cipher_left\@
  1982. sub $128, %r13
  1983. je _eight_cipher_left\@
  1984. vmovd %xmm9, %r15d
  1985. and $255, %r15d
  1986. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
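# Note: %r15d now holds the low byte of the counter. While it stays at or
# below 255-8, eight more increments cannot carry out of that byte, so the
# out_order path below bumps the counters with ONEf directly in AES byte
# order; once the byte would wrap, _encrypt_by_8 byte-swaps, increments
# with ONE (a full 32-bit add), and swaps back.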
  1987. _encrypt_by_8_new\@:
  1988. cmp $(255-8), %r15d
  1989. jg _encrypt_by_8\@
  1990. add $8, %r15b
  1991. GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
  1992. add $128, %r11
  1993. sub $128, %r13
  1994. jne _encrypt_by_8_new\@
  1995. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1996. jmp _eight_cipher_left\@
  1997. _encrypt_by_8\@:
  1998. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1999. add $8, %r15b
  2000. GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
  2001. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  2002. add $128, %r11
  2003. sub $128, %r13
  2004. jne _encrypt_by_8_new\@
  2005. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  2006. _eight_cipher_left\@:
  2007. GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
  2008. _zero_cipher_left\@:
  2009. cmp $16, arg4
  2010. jl _only_less_than_16\@
  2011. mov arg4, %r13
  2012. and $15, %r13 # r13 = (arg4 mod 16)
  2013. je _multiple_of_16_bytes\@
2014. # handle the last <16 Byte block separately
  2015. vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
  2016. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  2017. ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
  2018. sub $16, %r11
  2019. add %r13, %r11
  2020. vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
  2021. lea SHIFT_MASK+16(%rip), %r12
  2022. sub %r13, %r12 # adjust the shuffle mask pointer
  2023. # to be able to shift 16-r13 bytes
  2024. # (r13 is the number of bytes in plaintext mod 16)
  2025. vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
  2026. vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
  2027. jmp _final_ghash_mul\@
  2028. _only_less_than_16\@:
  2029. # check for 0 length
  2030. mov arg4, %r13
  2031. and $15, %r13 # r13 = (arg4 mod 16)
  2032. je _multiple_of_16_bytes\@
2033. # handle the last <16 Byte block separately
  2034. vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
  2035. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  2036. ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
  2037. lea SHIFT_MASK+16(%rip), %r12
  2038. sub %r13, %r12 # adjust the shuffle mask pointer to be
  2039. # able to shift 16-r13 bytes (r13 is the
  2040. # number of bytes in plaintext mod 16)
  2041. _get_last_16_byte_loop\@:
  2042. movb (arg3, %r11), %al
  2043. movb %al, TMP1 (%rsp , %r11)
  2044. add $1, %r11
  2045. cmp %r13, %r11
  2046. jne _get_last_16_byte_loop\@
  2047. vmovdqu TMP1(%rsp), %xmm1
  2048. sub $16, %r11
  2049. _final_ghash_mul\@:
  2050. .if \ENC_DEC == DEC
  2051. vmovdqa %xmm1, %xmm2
  2052. vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
  2053. vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
  2054. vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
  2055. vpand %xmm1, %xmm2, %xmm2
  2056. vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
  2057. vpxor %xmm2, %xmm14, %xmm14
  2058. #GHASH computation for the last <16 Byte block
  2059. GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
  2060. sub %r13, %r11
  2061. add $16, %r11
  2062. .else
  2063. vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
  2064. vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
  2065. vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
  2066. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  2067. vpxor %xmm9, %xmm14, %xmm14
  2068. #GHASH computation for the last <16 Byte block
  2069. GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
  2070. sub %r13, %r11
  2071. add $16, %r11
  2072. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
  2073. .endif
  2074. #############################
  2075. # output r13 Bytes
  2076. vmovq %xmm9, %rax
  2077. cmp $8, %r13
  2078. jle _less_than_8_bytes_left\@
  2079. mov %rax, (arg2 , %r11)
  2080. add $8, %r11
  2081. vpsrldq $8, %xmm9, %xmm9
  2082. vmovq %xmm9, %rax
  2083. sub $8, %r13
  2084. _less_than_8_bytes_left\@:
  2085. movb %al, (arg2 , %r11)
  2086. add $1, %r11
  2087. shr $8, %rax
  2088. sub $1, %r13
  2089. jne _less_than_8_bytes_left\@
  2090. #############################
  2091. _multiple_of_16_bytes\@:
  2092. mov arg7, %r12 # r12 = aadLen (number of bytes)
  2093. shl $3, %r12 # convert into number of bits
  2094. vmovd %r12d, %xmm15 # len(A) in xmm15
2095. shl $3, arg4 # len(C) in bits (*8)
  2096. vmovq arg4, %xmm1
  2097. vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
  2098. vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
  2099. vpxor %xmm15, %xmm14, %xmm14
  2100. GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
  2101. vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
  2102. mov arg5, %rax # rax = *Y0
  2103. vmovdqu (%rax), %xmm9 # xmm9 = Y0
  2104. ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
  2105. vpxor %xmm14, %xmm9, %xmm9
  2106. _return_T\@:
  2107. mov arg8, %r10 # r10 = authTag
  2108. mov arg9, %r11 # r11 = auth_tag_len
  2109. cmp $16, %r11
  2110. je _T_16\@
  2111. cmp $12, %r11
  2112. je _T_12\@
  2113. _T_8\@:
  2114. vmovq %xmm9, %rax
  2115. mov %rax, (%r10)
  2116. jmp _return_T_done\@
  2117. _T_12\@:
  2118. vmovq %xmm9, %rax
  2119. mov %rax, (%r10)
  2120. vpsrldq $8, %xmm9, %xmm9
  2121. vmovd %xmm9, %eax
  2122. mov %eax, 8(%r10)
  2123. jmp _return_T_done\@
  2124. _T_16\@:
  2125. vmovdqu %xmm9, (%r10)
  2126. _return_T_done\@:
  2127. mov %r14, %rsp
  2128. pop %r15
  2129. pop %r14
  2130. pop %r13
  2131. pop %r12
  2132. .endm
  2133. #############################################################
  2134. #void aesni_gcm_precomp_avx_gen4
  2135. # (gcm_data *my_ctx_data,
  2136. # u8 *hash_subkey)# /* H, the Hash sub key input.
  2137. # Data starts on a 16-byte boundary. */
  2138. #############################################################
  2139. ENTRY(aesni_gcm_precomp_avx_gen4)
  2140. #the number of pushes must equal STACK_OFFSET
  2141. push %r12
  2142. push %r13
  2143. push %r14
  2144. push %r15
  2145. mov %rsp, %r14
  2146. sub $VARIABLE_OFFSET, %rsp
  2147. and $~63, %rsp # align rsp to 64 bytes
  2148. vmovdqu (arg2), %xmm6 # xmm6 = HashKey
  2149. vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
  2150. ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
  2151. vmovdqa %xmm6, %xmm2
  2152. vpsllq $1, %xmm6, %xmm6
  2153. vpsrlq $63, %xmm2, %xmm2
  2154. vmovdqa %xmm2, %xmm1
  2155. vpslldq $8, %xmm2, %xmm2
  2156. vpsrldq $8, %xmm1, %xmm1
  2157. vpor %xmm2, %xmm6, %xmm6
  2158. #reduction
  2159. vpshufd $0b00100100, %xmm1, %xmm2
  2160. vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
  2161. vpand POLY(%rip), %xmm2, %xmm2
  2162. vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
  2163. #######################################################################
  2164. vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
  2165. PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
  2166. mov %r14, %rsp
  2167. pop %r15
  2168. pop %r14
  2169. pop %r13
  2170. pop %r12
  2171. ret
  2172. ENDPROC(aesni_gcm_precomp_avx_gen4)
  2173. ###############################################################################
  2174. #void aesni_gcm_enc_avx_gen4(
  2175. # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
  2176. # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
  2177. # const u8 *in, /* Plaintext input */
  2178. # u64 plaintext_len, /* Length of data in Bytes for encryption. */
  2179. # u8 *iv, /* Pre-counter block j0: 4 byte salt
  2180. # (from Security Association) concatenated with 8 byte
  2181. # Initialisation Vector (from IPSec ESP Payload)
  2182. # concatenated with 0x00000001. 16-byte aligned pointer. */
  2183. # const u8 *aad, /* Additional Authentication Data (AAD)*/
  2184. # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
  2185. # u8 *auth_tag, /* Authenticated Tag output. */
  2186. # u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
  2187. # Valid values are 16 (most likely), 12 or 8. */
  2188. ###############################################################################
  2189. ENTRY(aesni_gcm_enc_avx_gen4)
  2190. GCM_ENC_DEC_AVX2 ENC
  2191. ret
  2192. ENDPROC(aesni_gcm_enc_avx_gen4)
  2193. ###############################################################################
  2194. #void aesni_gcm_dec_avx_gen4(
  2195. # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
  2196. # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
  2197. # const u8 *in, /* Ciphertext input */
2198. # u64 plaintext_len, /* Length of data in Bytes for decryption. */
  2199. # u8 *iv, /* Pre-counter block j0: 4 byte salt
  2200. # (from Security Association) concatenated with 8 byte
  2201. # Initialisation Vector (from IPSec ESP Payload)
  2202. # concatenated with 0x00000001. 16-byte aligned pointer. */
  2203. # const u8 *aad, /* Additional Authentication Data (AAD)*/
  2204. # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
  2205. # u8 *auth_tag, /* Authenticated Tag output. */
  2206. # u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
  2207. # Valid values are 16 (most likely), 12 or 8. */
  2208. ###############################################################################
  2209. ENTRY(aesni_gcm_dec_avx_gen4)
  2210. GCM_ENC_DEC_AVX2 DEC
  2211. ret
  2212. ENDPROC(aesni_gcm_dec_avx_gen4)
  2213. #endif /* CONFIG_AS_AVX2 */