ipoib_main.c

/*
 * Copyright (c) 2004 Topspin Communications. All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ipoib.h"

#include <linux/module.h>

#include <linux/init.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>

#include <linux/if_arp.h>	/* For ARPHRD_xxx */
#include <linux/ip.h>
#include <linux/in.h>

#include <linux/jhash.h>
#include <net/arp.h>
#include <net/addrconf.h>
#include <linux/inetdevice.h>
#include <rdma/ib_cache.h>
#include <linux/pci.h>

#define DRV_VERSION "1.0.0"

const char ipoib_driver_version[] = DRV_VERSION;

MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_VERSION(DRV_VERSION);

int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;

module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
int ipoib_debug_level;

module_param_named(debug_level, ipoib_debug_level, int, 0644);
MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
#endif

struct ipoib_path_iter {
	struct net_device *dev;
	struct ipoib_path  path;
};

static const u8 ipv4_bcast_addr[] = {
	0x00, 0xff, 0xff, 0xff,
	0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
};

struct workqueue_struct *ipoib_workqueue;

struct ib_sa_client ipoib_sa_client;

static void ipoib_add_one(struct ib_device *device);
static void ipoib_remove_one(struct ib_device *device, void *client_data);
static void ipoib_neigh_reclaim(struct rcu_head *rp);
static struct net_device *ipoib_get_net_dev_by_params(
		struct ib_device *dev, u8 port, u16 pkey,
		const union ib_gid *gid, const struct sockaddr *addr,
		void *client_data);
static int ipoib_set_mac(struct net_device *dev, void *addr);

static struct ib_client ipoib_client = {
	.name   = "ipoib",
	.add    = ipoib_add_one,
	.remove = ipoib_remove_one,
	.get_net_dev_by_params = ipoib_get_net_dev_by_params,
};
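
/*
 * ndo_open handler: mark the interface administratively up, open the
 * underlying IB resources, and propagate IFF_UP to any child (pkey/VLAN)
 * interfaces. If opening fails because no P_Key has been assigned yet,
 * we return success and finish bringing the interface up later, when the
 * P_Key appears.
 */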
int ipoib_open(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "bringing up interface\n");

	netif_carrier_off(dev);

	set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	priv->sm_fullmember_sendonly_support = false;

	if (ipoib_ib_dev_open(dev)) {
		if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
			return 0;
		goto err_disable;
	}

	ipoib_ib_dev_up(dev);

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		struct ipoib_dev_priv *cpriv;

		/* Bring up any child interfaces too */
		down_read(&priv->vlan_rwsem);
		list_for_each_entry(cpriv, &priv->child_intfs, list) {
			int flags;

			flags = cpriv->dev->flags;
			if (flags & IFF_UP)
				continue;

			dev_change_flags(cpriv->dev, flags | IFF_UP);
		}
		up_read(&priv->vlan_rwsem);
	}

	netif_start_queue(dev);

	return 0;

err_disable:
	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	return -EINVAL;
}

static int ipoib_stop(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "stopping interface\n");

	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	netif_stop_queue(dev);

	ipoib_ib_dev_down(dev);
	ipoib_ib_dev_stop(dev);

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		struct ipoib_dev_priv *cpriv;

		/* Bring down any child interfaces too */
		down_read(&priv->vlan_rwsem);
		list_for_each_entry(cpriv, &priv->child_intfs, list) {
			int flags;

			flags = cpriv->dev->flags;
			if (!(flags & IFF_UP))
				continue;

			dev_change_flags(cpriv->dev, flags & ~IFF_UP);
		}
		up_read(&priv->vlan_rwsem);
	}

	return 0;
}

static void ipoib_uninit(struct net_device *dev)
{
	ipoib_dev_cleanup(dev);
}

static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
		features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO);

	return features;
}
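
/*
 * MTU changes follow two sets of rules: in connected mode the MTU may go
 * up to the CM limit (though multicast still uses the UD MTU, hence the
 * warning below), while in datagram mode the effective MTU is the smaller
 * of the admin-requested value and the multicast group MTU.
 */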
static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	/* dev->mtu > 2K ==> connected mode */
	if (ipoib_cm_admin_enabled(dev)) {
		if (new_mtu > ipoib_cm_max_mtu(dev))
			return -EINVAL;

		if (new_mtu > priv->mcast_mtu)
			ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
				   priv->mcast_mtu);

		dev->mtu = new_mtu;
		return 0;
	}

	if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
		return -EINVAL;

	priv->admin_mtu = new_mtu;

	if (priv->mcast_mtu < priv->admin_mtu)
		ipoib_dbg(priv, "MTU must be smaller than the underlying "
				"link layer MTU - 4 (%u)\n", priv->mcast_mtu);

	dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);

	return 0;
}

/* Called with an RCU read lock taken */
static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr,
					struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct in_device *in_dev;
	struct sockaddr_in *addr_in = (struct sockaddr_in *)addr;
	struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr;
	__be32 ret_addr;

	switch (addr->sa_family) {
	case AF_INET:
		in_dev = in_dev_get(dev);
		if (!in_dev)
			return false;

		ret_addr = inet_confirm_addr(net, in_dev, 0,
					     addr_in->sin_addr.s_addr,
					     RT_SCOPE_HOST);
		in_dev_put(in_dev);
		if (ret_addr)
			return true;

		break;
	case AF_INET6:
		if (IS_ENABLED(CONFIG_IPV6) &&
		    ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1))
			return true;

		break;
	}
	return false;
}

/**
 * Find the master net_device on top of the given net_device.
 * @dev: base IPoIB net_device
 *
 * Returns the master net_device with a reference held, or the same
 * net_device if no master exists.
 */
static struct net_device *ipoib_get_master_net_dev(struct net_device *dev)
{
	struct net_device *master;

	rcu_read_lock();
	master = netdev_master_upper_dev_get_rcu(dev);
	if (master)
		dev_hold(master);
	rcu_read_unlock();

	if (master)
		return master;

	dev_hold(dev);
	return dev;
}

struct ipoib_walk_data {
	const struct sockaddr *addr;
	struct net_device *result;
};

static int ipoib_upper_walk(struct net_device *upper, void *_data)
{
	struct ipoib_walk_data *data = _data;
	int ret = 0;

	if (ipoib_is_dev_match_addr_rcu(data->addr, upper)) {
		dev_hold(upper);
		data->result = upper;
		ret = 1;
	}

	return ret;
}

/**
 * Find a net_device matching the given address, which is an upper device of
 * the given net_device.
 * @addr: IP address to look for.
 * @dev: base IPoIB net_device
 *
 * If found, returns the net_device with a reference held. Otherwise return
 * NULL.
 */
static struct net_device *ipoib_get_net_dev_match_addr(
		const struct sockaddr *addr, struct net_device *dev)
{
	struct ipoib_walk_data data = {
		.addr = addr,
	};

	rcu_read_lock();
	if (ipoib_is_dev_match_addr_rcu(addr, dev)) {
		dev_hold(dev);
		data.result = dev;
		goto out;
	}

	netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &data);
out:
	rcu_read_unlock();
	return data.result;
}

/* returns the number of IPoIB netdevs on top of a given ipoib device
 * matching a pkey_index and address, if one exists.
 *
 * @found_net_dev: contains a matching net_device if the return value >= 1,
 * with a reference held. */
static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv,
				     const union ib_gid *gid,
				     u16 pkey_index,
				     const struct sockaddr *addr,
				     int nesting,
				     struct net_device **found_net_dev)
{
	struct ipoib_dev_priv *child_priv;
	struct net_device *net_dev = NULL;
	int matches = 0;

	if (priv->pkey_index == pkey_index &&
	    (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) {
		if (!addr) {
			net_dev = ipoib_get_master_net_dev(priv->dev);
		} else {
			/* Verify the net_device matches the IP address, as
			 * IPoIB child devices currently share a GID. */
			net_dev = ipoib_get_net_dev_match_addr(addr, priv->dev);
		}
		if (net_dev) {
			if (!*found_net_dev)
				*found_net_dev = net_dev;
			else
				dev_put(net_dev);
			++matches;
		}
	}

	/* Check child interfaces */
	down_read_nested(&priv->vlan_rwsem, nesting);
	list_for_each_entry(child_priv, &priv->child_intfs, list) {
		matches += ipoib_match_gid_pkey_addr(child_priv, gid,
						     pkey_index, addr,
						     nesting + 1,
						     found_net_dev);
		if (matches > 1)
			break;
	}
	up_read(&priv->vlan_rwsem);

	return matches;
}

/* Returns the number of matching net_devs found (between 0 and 2). Also
 * return the matching net_device in the @net_dev parameter, holding a
 * reference to the net_device, if the number of matches >= 1 */
static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port,
					 u16 pkey_index,
					 const union ib_gid *gid,
					 const struct sockaddr *addr,
					 struct net_device **net_dev)
{
	struct ipoib_dev_priv *priv;
	int matches = 0;

	*net_dev = NULL;

	list_for_each_entry(priv, dev_list, list) {
		if (priv->port != port)
			continue;

		matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index,
						     addr, 0, net_dev);
		if (matches > 1)
			break;
	}

	return matches;
}

static struct net_device *ipoib_get_net_dev_by_params(
		struct ib_device *dev, u8 port, u16 pkey,
		const union ib_gid *gid, const struct sockaddr *addr,
		void *client_data)
{
	struct net_device *net_dev;
	struct list_head *dev_list = client_data;
	u16 pkey_index;
	int matches;
	int ret;

	if (!rdma_protocol_ib(dev, port))
		return NULL;

	ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index);
	if (ret)
		return NULL;

	if (!dev_list)
		return NULL;

	/* See if we can find a unique device matching the L2 parameters */
	matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
						gid, NULL, &net_dev);

	switch (matches) {
	case 0:
		return NULL;
	case 1:
		return net_dev;
	}

	dev_put(net_dev);

	/* Couldn't find a unique device with L2 parameters only. Use L3
	 * address to uniquely match the net device */
	matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
						gid, addr, &net_dev);
	switch (matches) {
	case 0:
		return NULL;
	default:
		dev_warn_ratelimited(&dev->dev,
				     "duplicate IP address detected\n");
		/* Fall through */
	case 1:
		return net_dev;
	}
}
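
/*
 * Switch between datagram and connected mode from sysfs. The caller holds
 * the RTNL lock; it is dropped around ipoib_flush_paths() so in-flight
 * path work can complete, and -EBUSY is returned if it cannot be
 * re-acquired afterwards.
 */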
int ipoib_set_mode(struct net_device *dev, const char *buf)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	if ((test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) &&
	     !strcmp(buf, "connected\n")) ||
	    (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) &&
	     !strcmp(buf, "datagram\n"))) {
		return 0;
	}

	/* flush paths if we switch modes so that connections are restarted */
	if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) {
		set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
		ipoib_warn(priv, "enabling connected mode "
			   "will cause multicast packet drops\n");
		netdev_update_features(dev);
		dev_set_mtu(dev, ipoib_cm_max_mtu(dev));
		rtnl_unlock();
		priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;

		ipoib_flush_paths(dev);
		return (!rtnl_trylock()) ? -EBUSY : 0;
	}

	if (!strcmp(buf, "datagram\n")) {
		clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
		netdev_update_features(dev);
		dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
		rtnl_unlock();
		ipoib_flush_paths(dev);
		return (!rtnl_trylock()) ? -EBUSY : 0;
	}

	return -EINVAL;
}
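
/*
 * Paths are kept in a per-device red-black tree keyed by destination GID
 * (memcmp order), plus a list used for iteration and flushing. Both
 * structures are protected by priv->lock, which the callers hold.
 */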
struct ipoib_path *__path_find(struct net_device *dev, void *gid)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct rb_node *n = priv->path_tree.rb_node;
	struct ipoib_path *path;
	int ret;

	while (n) {
		path = rb_entry(n, struct ipoib_path, rb_node);

		ret = memcmp(gid, path->pathrec.dgid.raw,
			     sizeof (union ib_gid));

		if (ret < 0)
			n = n->rb_left;
		else if (ret > 0)
			n = n->rb_right;
		else
			return path;
	}

	return NULL;
}

static int __path_add(struct net_device *dev, struct ipoib_path *path)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct rb_node **n = &priv->path_tree.rb_node;
	struct rb_node *pn = NULL;
	struct ipoib_path *tpath;
	int ret;

	while (*n) {
		pn = *n;
		tpath = rb_entry(pn, struct ipoib_path, rb_node);

		ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
			     sizeof (union ib_gid));
		if (ret < 0)
			n = &pn->rb_left;
		else if (ret > 0)
			n = &pn->rb_right;
		else
			return -EEXIST;
	}

	rb_link_node(&path->rb_node, pn, n);
	rb_insert_color(&path->rb_node, &priv->path_tree);

	list_add_tail(&path->list, &priv->path_list);

	return 0;
}

static void path_free(struct net_device *dev, struct ipoib_path *path)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue(&path->queue)))
		dev_kfree_skb_irq(skb);

	ipoib_dbg(netdev_priv(dev), "path_free\n");

	/* remove all neigh connected to this path */
	ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);

	if (path->ah)
		ipoib_put_ah(path->ah);

	kfree(path);
}

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG

struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
{
	struct ipoib_path_iter *iter;

	iter = kmalloc(sizeof *iter, GFP_KERNEL);
	if (!iter)
		return NULL;

	iter->dev = dev;
	memset(iter->path.pathrec.dgid.raw, 0, 16);

	if (ipoib_path_iter_next(iter)) {
		kfree(iter);
		return NULL;
	}

	return iter;
}

int ipoib_path_iter_next(struct ipoib_path_iter *iter)
{
	struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
	struct rb_node *n;
	struct ipoib_path *path;
	int ret = 1;

	spin_lock_irq(&priv->lock);

	n = rb_first(&priv->path_tree);

	while (n) {
		path = rb_entry(n, struct ipoib_path, rb_node);

		if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
			   sizeof (union ib_gid)) < 0) {
			iter->path = *path;
			ret = 0;
			break;
		}

		n = rb_next(n);
	}

	spin_unlock_irq(&priv->lock);

	return ret;
}

void ipoib_path_iter_read(struct ipoib_path_iter *iter,
			  struct ipoib_path *path)
{
	*path = iter->path;
}

#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */

void ipoib_mark_paths_invalid(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path, *tp;

	spin_lock_irq(&priv->lock);

	list_for_each_entry_safe(path, tp, &priv->path_list, list) {
		ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n",
			  be16_to_cpu(path->pathrec.dlid),
			  path->pathrec.dgid.raw);
		path->valid = 0;
	}

	spin_unlock_irq(&priv->lock);
}

struct classport_info_context {
	struct ipoib_dev_priv	*priv;
	struct completion	done;
	struct ib_sa_query	*sa_query;
};

static void classport_info_query_cb(int status, struct ib_class_port_info *rec,
				    void *context)
{
	struct classport_info_context *cb_ctx = context;
	struct ipoib_dev_priv *priv;

	WARN_ON(!context);

	priv = cb_ctx->priv;

	if (status || !rec) {
		pr_debug("device: %s failed query classport_info status: %d\n",
			 priv->dev->name, status);
		/* keeps the default, will try next mcast_restart */
		priv->sm_fullmember_sendonly_support = false;
		goto out;
	}

	if (ib_get_cpi_capmask2(rec) &
	    IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT) {
		pr_debug("device: %s enabled fullmember-sendonly for sendonly MCG\n",
			 priv->dev->name);
		priv->sm_fullmember_sendonly_support = true;
	} else {
		pr_debug("device: %s disabled fullmember-sendonly for sendonly MCG\n",
			 priv->dev->name);
		priv->sm_fullmember_sendonly_support = false;
	}

out:
	complete(&cb_ctx->done);
}

int ipoib_check_sm_sendonly_fullmember_support(struct ipoib_dev_priv *priv)
{
	struct classport_info_context *callback_context;
	int ret;

	callback_context = kmalloc(sizeof(*callback_context), GFP_KERNEL);
	if (!callback_context)
		return -ENOMEM;

	callback_context->priv = priv;
	init_completion(&callback_context->done);

	ret = ib_sa_classport_info_rec_query(&ipoib_sa_client,
					     priv->ca, priv->port, 3000,
					     GFP_KERNEL,
					     classport_info_query_cb,
					     callback_context,
					     &callback_context->sa_query);
	if (ret < 0) {
		pr_info("%s failed to send ib_sa_classport_info query, ret: %d\n",
			priv->dev->name, ret);
		kfree(callback_context);
		return ret;
	}

	/* wait for the callback to finish before returning */
	wait_for_completion(&callback_context->done);
	kfree(callback_context);

	return ret;
}

static void push_pseudo_header(struct sk_buff *skb, const char *daddr)
{
	struct ipoib_pseudo_header *phdr;

	phdr = (struct ipoib_pseudo_header *)skb_push(skb, sizeof(*phdr));
	memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
}

void ipoib_flush_paths(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path, *tp;
	LIST_HEAD(remove_list);
	unsigned long flags;

	netif_tx_lock_bh(dev);
	spin_lock_irqsave(&priv->lock, flags);

	list_splice_init(&priv->path_list, &remove_list);

	list_for_each_entry(path, &remove_list, list)
		rb_erase(&path->rb_node, &priv->path_tree);

	list_for_each_entry_safe(path, tp, &remove_list, list) {
		if (path->query)
			ib_sa_cancel_query(path->query_id, path->query);
		spin_unlock_irqrestore(&priv->lock, flags);
		netif_tx_unlock_bh(dev);
		wait_for_completion(&path->done);
		path_free(dev, path);
		netif_tx_lock_bh(dev);
		spin_lock_irqsave(&priv->lock, flags);
	}

	spin_unlock_irqrestore(&priv->lock, flags);
	netif_tx_unlock_bh(dev);
}
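
/*
 * SA path record completion handler. On success it builds a new address
 * handle, swaps it into the path (and into every neighbour hanging off the
 * path) under priv->lock, and re-transmits any skbs that were queued while
 * the lookup was in flight. On failure the neighbours for this GID are
 * torn down instead.
 */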
static void path_rec_completion(int status,
				struct ib_sa_path_rec *pathrec,
				void *path_ptr)
{
	struct ipoib_path *path = path_ptr;
	struct net_device *dev = path->dev;
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_ah *ah = NULL;
	struct ipoib_ah *old_ah = NULL;
	struct ipoib_neigh *neigh, *tn;
	struct sk_buff_head skqueue;
	struct sk_buff *skb;
	unsigned long flags;

	if (!status)
		ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
			  be16_to_cpu(pathrec->dlid), pathrec->dgid.raw);
	else
		ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
			  status, path->pathrec.dgid.raw);

	skb_queue_head_init(&skqueue);

	if (!status) {
		struct ib_ah_attr av;

		if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
			ah = ipoib_create_ah(dev, priv->pd, &av);
	}

	spin_lock_irqsave(&priv->lock, flags);

	if (!IS_ERR_OR_NULL(ah)) {
		path->pathrec = *pathrec;

		old_ah   = path->ah;
		path->ah = ah;

		ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
			  ah, be16_to_cpu(pathrec->dlid), pathrec->sl);

		while ((skb = __skb_dequeue(&path->queue)))
			__skb_queue_tail(&skqueue, skb);

		list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
			if (neigh->ah) {
				WARN_ON(neigh->ah != old_ah);
				/*
				 * Dropping the ah reference inside
				 * priv->lock is safe here, because we
				 * will hold one more reference from
				 * the original value of path->ah (ie
				 * old_ah).
				 */
				ipoib_put_ah(neigh->ah);
			}
			kref_get(&path->ah->ref);
			neigh->ah = path->ah;

			if (ipoib_cm_enabled(dev, neigh->daddr)) {
				if (!ipoib_cm_get(neigh))
					ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
									       path,
									       neigh));
				if (!ipoib_cm_get(neigh)) {
					ipoib_neigh_free(neigh);
					continue;
				}
			}

			while ((skb = __skb_dequeue(&neigh->queue)))
				__skb_queue_tail(&skqueue, skb);
		}
		path->valid = 1;
	}

	path->query = NULL;
	complete(&path->done);

	spin_unlock_irqrestore(&priv->lock, flags);

	if (IS_ERR_OR_NULL(ah))
		ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);

	if (old_ah)
		ipoib_put_ah(old_ah);

	while ((skb = __skb_dequeue(&skqueue))) {
		int ret;

		skb->dev = dev;
		ret = dev_queue_xmit(skb);
		if (ret)
			ipoib_warn(priv, "%s: dev_queue_xmit failed to re-queue packet, ret:%d\n",
				   __func__, ret);
	}
}

static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path;

	if (!priv->broadcast)
		return NULL;

	path = kzalloc(sizeof *path, GFP_ATOMIC);
	if (!path)
		return NULL;

	path->dev = dev;

	skb_queue_head_init(&path->queue);

	INIT_LIST_HEAD(&path->neigh_list);

	memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
	path->pathrec.sgid	    = priv->local_gid;
	path->pathrec.pkey	    = cpu_to_be16(priv->pkey);
	path->pathrec.numb_path     = 1;
	path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;

	return path;
}

static int path_rec_start(struct net_device *dev,
			  struct ipoib_path *path)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "Start path record lookup for %pI6\n",
		  path->pathrec.dgid.raw);

	init_completion(&path->done);

	path->query_id =
		ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
				   &path->pathrec,
				   IB_SA_PATH_REC_DGID		|
				   IB_SA_PATH_REC_SGID		|
				   IB_SA_PATH_REC_NUMB_PATH	|
				   IB_SA_PATH_REC_TRAFFIC_CLASS |
				   IB_SA_PATH_REC_PKEY,
				   1000, GFP_ATOMIC,
				   path_rec_completion,
				   path, &path->query);
	if (path->query_id < 0) {
		ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
		path->query = NULL;
		complete(&path->done);
		return path->query_id;
	}

	return 0;
}
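
/*
 * Slow path for unicast TX: allocate (or find) a neighbour entry and its
 * path. If an address handle already exists the skb is sent right away;
 * otherwise a path record lookup is started and the skb is queued on the
 * neighbour, bounded by IPOIB_MAX_PATH_REC_QUEUE.
 */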
static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
			   struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path;
	struct ipoib_neigh *neigh;
	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);
	neigh = ipoib_neigh_alloc(daddr, dev);
	if (!neigh) {
		spin_unlock_irqrestore(&priv->lock, flags);
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
		return;
	}

	path = __path_find(dev, daddr + 4);
	if (!path) {
		path = path_rec_create(dev, daddr + 4);
		if (!path)
			goto err_path;

		__path_add(dev, path);
	}

	list_add_tail(&neigh->list, &path->neigh_list);

	if (path->ah) {
		kref_get(&path->ah->ref);
		neigh->ah = path->ah;

		if (ipoib_cm_enabled(dev, neigh->daddr)) {
			if (!ipoib_cm_get(neigh))
				ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
			if (!ipoib_cm_get(neigh)) {
				ipoib_neigh_free(neigh);
				goto err_drop;
			}
			if (skb_queue_len(&neigh->queue) <
			    IPOIB_MAX_PATH_REC_QUEUE) {
				push_pseudo_header(skb, neigh->daddr);
				__skb_queue_tail(&neigh->queue, skb);
			} else {
				ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
					   skb_queue_len(&neigh->queue));
				goto err_drop;
			}
		} else {
			spin_unlock_irqrestore(&priv->lock, flags);
			ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr));
			ipoib_neigh_put(neigh);
			return;
		}
	} else {
		neigh->ah  = NULL;

		if (!path->query && path_rec_start(dev, path))
			goto err_path;
		if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
			push_pseudo_header(skb, neigh->daddr);
			__skb_queue_tail(&neigh->queue, skb);
		} else {
			goto err_drop;
		}
	}

	spin_unlock_irqrestore(&priv->lock, flags);
	ipoib_neigh_put(neigh);
	return;

err_path:
	ipoib_neigh_free(neigh);
err_drop:
	++dev->stats.tx_dropped;
	dev_kfree_skb_any(skb);

	spin_unlock_irqrestore(&priv->lock, flags);
	ipoib_neigh_put(neigh);
}

static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
			     struct ipoib_pseudo_header *phdr)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path;
	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);

	path = __path_find(dev, phdr->hwaddr + 4);
	if (!path || !path->valid) {
		int new_path = 0;

		if (!path) {
			path = path_rec_create(dev, phdr->hwaddr + 4);
			new_path = 1;
		}
		if (path) {
			if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
				push_pseudo_header(skb, phdr->hwaddr);
				__skb_queue_tail(&path->queue, skb);
			} else {
				++dev->stats.tx_dropped;
				dev_kfree_skb_any(skb);
			}

			if (!path->query && path_rec_start(dev, path)) {
				spin_unlock_irqrestore(&priv->lock, flags);
				if (new_path)
					path_free(dev, path);
				return;
			} else
				__path_add(dev, path);
		} else {
			++dev->stats.tx_dropped;
			dev_kfree_skb_any(skb);
		}

		spin_unlock_irqrestore(&priv->lock, flags);
		return;
	}

	if (path->ah) {
		ipoib_dbg(priv, "Send unicast ARP to %04x\n",
			  be16_to_cpu(path->pathrec.dlid));

		spin_unlock_irqrestore(&priv->lock, flags);
		ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr));
		return;
	} else if ((path->query || !path_rec_start(dev, path)) &&
		   skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
		push_pseudo_header(skb, phdr->hwaddr);
		__skb_queue_tail(&path->queue, skb);
	} else {
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}

	spin_unlock_irqrestore(&priv->lock, flags);
}
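
/*
 * ndo_start_xmit handler. The hard header only carries the ethertype; the
 * destination hardware address travels in the ipoib_pseudo_header pushed
 * by ipoib_hard_header(), which is stripped here and used to choose
 * between multicast, CM, and UD transmission.
 */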
static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh *neigh;
	struct ipoib_pseudo_header *phdr;
	struct ipoib_header *header;
	unsigned long flags;

	phdr = (struct ipoib_pseudo_header *) skb->data;
	skb_pull(skb, sizeof(*phdr));
	header = (struct ipoib_header *) skb->data;

	if (unlikely(phdr->hwaddr[4] == 0xff)) {
		/* multicast, arrange "if" according to probability */
		if ((header->proto != htons(ETH_P_IP)) &&
		    (header->proto != htons(ETH_P_IPV6)) &&
		    (header->proto != htons(ETH_P_ARP)) &&
		    (header->proto != htons(ETH_P_RARP)) &&
		    (header->proto != htons(ETH_P_TIPC))) {
			/* ethertype not supported by IPoIB */
			++dev->stats.tx_dropped;
			dev_kfree_skb_any(skb);
			return NETDEV_TX_OK;
		}
		/* Add in the P_Key for multicast */
		phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
		phdr->hwaddr[9] = priv->pkey & 0xff;

		neigh = ipoib_neigh_get(dev, phdr->hwaddr);
		if (likely(neigh))
			goto send_using_neigh;
		ipoib_mcast_send(dev, phdr->hwaddr, skb);
		return NETDEV_TX_OK;
	}

	/* unicast, arrange "switch" according to probability */
	switch (header->proto) {
	case htons(ETH_P_IP):
	case htons(ETH_P_IPV6):
	case htons(ETH_P_TIPC):
		neigh = ipoib_neigh_get(dev, phdr->hwaddr);
		if (unlikely(!neigh)) {
			neigh_add_path(skb, phdr->hwaddr, dev);
			return NETDEV_TX_OK;
		}
		break;
	case htons(ETH_P_ARP):
	case htons(ETH_P_RARP):
		/* unicast ARP and RARP should always perform a path lookup */
		unicast_arp_send(skb, dev, phdr);
		return NETDEV_TX_OK;
	default:
		/* ethertype not supported by IPoIB */
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}

send_using_neigh:
	/* note we now hold a ref to neigh */
	if (ipoib_cm_get(neigh)) {
		if (ipoib_cm_up(neigh)) {
			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
			goto unref;
		}
	} else if (neigh->ah) {
		ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(phdr->hwaddr));
		goto unref;
	}

	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
		push_pseudo_header(skb, phdr->hwaddr);
		spin_lock_irqsave(&priv->lock, flags);
		__skb_queue_tail(&neigh->queue, skb);
		spin_unlock_irqrestore(&priv->lock, flags);
	} else {
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}

unref:
	ipoib_neigh_put(neigh);

	return NETDEV_TX_OK;
}

static void ipoib_timeout(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
		   jiffies_to_msecs(jiffies - dev_trans_start(dev)));
	ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",
		   netif_queue_stopped(dev),
		   priv->tx_head, priv->tx_tail);
	/* XXX reset QP, etc. */
}

static int ipoib_hard_header(struct sk_buff *skb,
			     struct net_device *dev,
			     unsigned short type,
			     const void *daddr, const void *saddr, unsigned len)
{
	struct ipoib_header *header;

	header = (struct ipoib_header *) skb_push(skb, sizeof *header);

	header->proto = htons(type);
	header->reserved = 0;

	/*
	 * We don't rely on the dst_entry structure; always stuff the
	 * destination address into the skb hard header so we can figure out
	 * where to send the packet later.
	 */
	push_pseudo_header(skb, daddr);

	return IPOIB_HARD_LEN;
}

static void ipoib_set_mcast_list(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
		ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");
		return;
	}

	queue_work(priv->wq, &priv->restart_task);
}

static int ipoib_get_iflink(const struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	/* parent interface */
	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
		return dev->ifindex;

	/* child/vlan interface */
	return priv->parent->ifindex;
}

static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
{
	/*
	 * Use only the address parts that contribute to spreading.
	 * The subnet prefix is not used, as one cannot connect to the
	 * same remote port (GUID) using the same remote QPN via two
	 * different subnets.
	 */
	/* qpn octets[1:4) & port GUID octets[12:20) */
	u32 *d32 = (u32 *) daddr;
	u32 hv;

	hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0);

	return hv & htbl->mask;
}
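
/*
 * Lockless neighbour lookup. The hash table is traversed under
 * rcu_read_lock_bh(); the caller's reference is taken with
 * atomic_inc_not_zero() so an entry that is concurrently being deleted is
 * never handed back.
 */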
struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh *neigh = NULL;
	u32 hash_val;

	rcu_read_lock_bh();

	htbl = rcu_dereference_bh(ntbl->htbl);

	if (!htbl)
		goto out_unlock;

	hash_val = ipoib_addr_hash(htbl, daddr);
	for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]);
	     neigh != NULL;
	     neigh = rcu_dereference_bh(neigh->hnext)) {
		if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
			/* found, take one ref on behalf of the caller */
			if (!atomic_inc_not_zero(&neigh->refcnt)) {
				/* deleted */
				neigh = NULL;
				goto out_unlock;
			}

			if (likely(skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE))
				neigh->alive = jiffies;
			goto out_unlock;
		}
	}

out_unlock:
	rcu_read_unlock_bh();
	return neigh;
}
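
/*
 * Garbage-collect neighbour entries that have been idle for two ARP GC
 * intervals. Stale entries are unlinked under priv->lock and freed via
 * call_rcu(); any send-only multicast groups they referenced are gathered
 * on remove_list and released after the lock is dropped.
 */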
static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	unsigned long neigh_obsolete;
	unsigned long dt;
	unsigned long flags;
	int i;
	LIST_HEAD(remove_list);

	if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
		return;

	spin_lock_irqsave(&priv->lock, flags);

	htbl = rcu_dereference_protected(ntbl->htbl,
					 lockdep_is_held(&priv->lock));

	if (!htbl)
		goto out_unlock;

	/* neigh is obsolete if it was idle for two GC periods */
	dt = 2 * arp_tbl.gc_interval;
	neigh_obsolete = jiffies - dt;
	/* handle possible race condition */
	if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
		goto out_unlock;

	for (i = 0; i < htbl->size; i++) {
		struct ipoib_neigh *neigh;
		struct ipoib_neigh __rcu **np = &htbl->buckets[i];

		while ((neigh = rcu_dereference_protected(*np,
							  lockdep_is_held(&priv->lock))) != NULL) {
			/* was the neigh idle for two GC periods */
			if (time_after(neigh_obsolete, neigh->alive)) {

				ipoib_check_and_add_mcast_sendonly(priv, neigh->daddr + 4, &remove_list);

				rcu_assign_pointer(*np,
						   rcu_dereference_protected(neigh->hnext,
									     lockdep_is_held(&priv->lock)));
				/* remove from path/mc list */
				list_del_init(&neigh->list);
				call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
			} else {
				np = &neigh->hnext;
			}
		}
	}

out_unlock:
	spin_unlock_irqrestore(&priv->lock, flags);
	ipoib_mcast_remove_list(&remove_list);
}

static void ipoib_reap_neigh(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, neigh_reap_task.work);

	__ipoib_reap_neigh(priv);

	if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
		queue_delayed_work(priv->wq, &priv->neigh_reap_task,
				   arp_tbl.gc_interval);
}

static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,
					    struct net_device *dev)
{
	struct ipoib_neigh *neigh;

	neigh = kzalloc(sizeof *neigh, GFP_ATOMIC);
	if (!neigh)
		return NULL;

	neigh->dev = dev;
	memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr));
	skb_queue_head_init(&neigh->queue);
	INIT_LIST_HEAD(&neigh->list);
	ipoib_cm_set(neigh, NULL);
	/* one ref on behalf of the caller */
	atomic_set(&neigh->refcnt, 1);

	return neigh;
}

struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
				      struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh *neigh;
	u32 hash_val;

	htbl = rcu_dereference_protected(ntbl->htbl,
					 lockdep_is_held(&priv->lock));
	if (!htbl) {
		neigh = NULL;
		goto out_unlock;
	}

	/* We may need to add a new neigh, but another thread may have beaten
	 * us to it; recalculate the hash (a resize may have taken place) and
	 * search first.
	 */
	hash_val = ipoib_addr_hash(htbl, daddr);
	for (neigh = rcu_dereference_protected(htbl->buckets[hash_val],
					       lockdep_is_held(&priv->lock));
	     neigh != NULL;
	     neigh = rcu_dereference_protected(neigh->hnext,
					       lockdep_is_held(&priv->lock))) {
		if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
			/* found, take one ref on behalf of the caller */
			if (!atomic_inc_not_zero(&neigh->refcnt)) {
				/* deleted */
				neigh = NULL;
				break;
			}
			neigh->alive = jiffies;
			goto out_unlock;
		}
	}

	neigh = ipoib_neigh_ctor(daddr, dev);
	if (!neigh)
		goto out_unlock;

	/* one ref on behalf of the hash table */
	atomic_inc(&neigh->refcnt);
	neigh->alive = jiffies;
	/* put in hash */
	rcu_assign_pointer(neigh->hnext,
			   rcu_dereference_protected(htbl->buckets[hash_val],
						     lockdep_is_held(&priv->lock)));
	rcu_assign_pointer(htbl->buckets[hash_val], neigh);
	atomic_inc(&ntbl->entries);

out_unlock:

	return neigh;
}

void ipoib_neigh_dtor(struct ipoib_neigh *neigh)
{
	/* neigh reference count was dropped to zero */
	struct net_device *dev = neigh->dev;
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct sk_buff *skb;

	if (neigh->ah)
		ipoib_put_ah(neigh->ah);
	while ((skb = __skb_dequeue(&neigh->queue))) {
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}
	if (ipoib_cm_get(neigh))
		ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
	ipoib_dbg(netdev_priv(dev),
		  "neigh free for %06x %pI6\n",
		  IPOIB_QPN(neigh->daddr),
		  neigh->daddr + 4);
	kfree(neigh);
	if (atomic_dec_and_test(&priv->ntbl.entries)) {
		if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags))
			complete(&priv->ntbl.flushed);
	}
}

static void ipoib_neigh_reclaim(struct rcu_head *rp)
{
	/* Called as a result of removal from hash table */
	struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu);

	/* note TX context may hold another ref */
	ipoib_neigh_put(neigh);
}

void ipoib_neigh_free(struct ipoib_neigh *neigh)
{
	struct net_device *dev = neigh->dev;
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh __rcu **np;
	struct ipoib_neigh *n;
	u32 hash_val;

	htbl = rcu_dereference_protected(ntbl->htbl,
					 lockdep_is_held(&priv->lock));
	if (!htbl)
		return;

	hash_val = ipoib_addr_hash(htbl, neigh->daddr);
	np = &htbl->buckets[hash_val];
	for (n = rcu_dereference_protected(*np,
					   lockdep_is_held(&priv->lock));
	     n != NULL;
	     n = rcu_dereference_protected(*np,
					   lockdep_is_held(&priv->lock))) {
		if (n == neigh) {
			/* found */
			rcu_assign_pointer(*np,
					   rcu_dereference_protected(neigh->hnext,
								     lockdep_is_held(&priv->lock)));
			/* remove from parent list */
			list_del_init(&neigh->list);
			call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
			return;
		} else {
			np = &n->hnext;
		}
	}
}
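
/*
 * Allocate the neighbour hash table. The bucket count follows the ARP
 * table's gc_thresh3, rounded up to a power of two so lookups can mask
 * the hash value, and the periodic GC work is kicked off once the table
 * is published with RCU_INIT_POINTER().
 */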
static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh __rcu **buckets;
	u32 size;

	clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
	ntbl->htbl = NULL;
	htbl = kzalloc(sizeof(*htbl), GFP_KERNEL);
	if (!htbl)
		return -ENOMEM;
	set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	size = roundup_pow_of_two(arp_tbl.gc_thresh3);
	buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL);
	if (!buckets) {
		kfree(htbl);
		return -ENOMEM;
	}
	htbl->size = size;
	htbl->mask = (size - 1);
	htbl->buckets = buckets;
	RCU_INIT_POINTER(ntbl->htbl, htbl);
	htbl->ntbl = ntbl;
	atomic_set(&ntbl->entries, 0);

	/* start garbage collection */
	clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	queue_delayed_work(priv->wq, &priv->neigh_reap_task,
			   arp_tbl.gc_interval);

	return 0;
}

static void neigh_hash_free_rcu(struct rcu_head *head)
{
	struct ipoib_neigh_hash *htbl = container_of(head,
						     struct ipoib_neigh_hash,
						     rcu);
	struct ipoib_neigh __rcu **buckets = htbl->buckets;
	struct ipoib_neigh_table *ntbl = htbl->ntbl;

	kfree(buckets);
	kfree(htbl);
	complete(&ntbl->deleted);
}

void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	unsigned long flags;
	int i;

	/* remove all neigh connected to a given path or mcast */
	spin_lock_irqsave(&priv->lock, flags);

	htbl = rcu_dereference_protected(ntbl->htbl,
					 lockdep_is_held(&priv->lock));

	if (!htbl)
		goto out_unlock;

	for (i = 0; i < htbl->size; i++) {
		struct ipoib_neigh *neigh;
		struct ipoib_neigh __rcu **np = &htbl->buckets[i];

		while ((neigh = rcu_dereference_protected(*np,
							  lockdep_is_held(&priv->lock))) != NULL) {
			/* delete neighs belonging to this parent */
			if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) {
				rcu_assign_pointer(*np,
						   rcu_dereference_protected(neigh->hnext,
									     lockdep_is_held(&priv->lock)));
				/* remove from parent list */
				list_del_init(&neigh->list);
				call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
			} else {
				np = &neigh->hnext;
			}
		}
	}
out_unlock:
	spin_unlock_irqrestore(&priv->lock, flags);
}
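
/*
 * Drop every neighbour and retire the hash table itself. The table
 * pointer is cleared before the RCU free so no new lookups can find it,
 * and if any entries were still live we wait on ntbl->flushed until they
 * have all been reclaimed.
 */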
static void ipoib_flush_neighs(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	unsigned long flags;
	int i, wait_flushed = 0;

	init_completion(&priv->ntbl.flushed);

	spin_lock_irqsave(&priv->lock, flags);

	htbl = rcu_dereference_protected(ntbl->htbl,
					 lockdep_is_held(&priv->lock));
	if (!htbl)
		goto out_unlock;

	wait_flushed = atomic_read(&priv->ntbl.entries);
	if (!wait_flushed)
		goto free_htbl;

	for (i = 0; i < htbl->size; i++) {
		struct ipoib_neigh *neigh;
		struct ipoib_neigh __rcu **np = &htbl->buckets[i];

		while ((neigh = rcu_dereference_protected(*np,
							  lockdep_is_held(&priv->lock))) != NULL) {
			rcu_assign_pointer(*np,
					   rcu_dereference_protected(neigh->hnext,
								     lockdep_is_held(&priv->lock)));
			/* remove from path/mc list */
			list_del_init(&neigh->list);
			call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
		}
	}

free_htbl:
	rcu_assign_pointer(ntbl->htbl, NULL);
	call_rcu(&htbl->rcu, neigh_hash_free_rcu);

out_unlock:
	spin_unlock_irqrestore(&priv->lock, flags);
	if (wait_flushed)
		wait_for_completion(&priv->ntbl.flushed);
}

static void ipoib_neigh_hash_uninit(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	int stopped;

	ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n");
	init_completion(&priv->ntbl.deleted);
	set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);

	/* Stop GC; even if called after an init failure we still need to
	 * cancel the work */
	stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	if (!stopped)
		cancel_delayed_work(&priv->neigh_reap_task);

	ipoib_flush_neighs(priv);

	wait_for_completion(&priv->ntbl.deleted);
}
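
/*
 * Per-device init: allocate the RX and TX rings, bring up the IB
 * resources (which also creates the per-device workqueue), and only then
 * set up the neighbour hash table, since its GC work is queued on that
 * workqueue.
 */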
int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	/* Allocate RX/TX "rings" to hold queued skbs */
	priv->rx_ring =	kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
				GFP_KERNEL);
	if (!priv->rx_ring)
		goto out;

	priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
	if (!priv->tx_ring) {
		printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
		       ca->name, ipoib_sendq_size);
		goto out_rx_ring_cleanup;
	}

	/* priv->tx_head, tx_tail & tx_outstanding are already 0 */

	if (ipoib_ib_dev_init(dev, ca, port))
		goto out_tx_ring_cleanup;

	/*
	 * Must be after ipoib_ib_dev_init so we can allocate a per
	 * device wq there and use it here
	 */
	if (ipoib_neigh_hash_init(priv) < 0)
		goto out_dev_uninit;

	return 0;

out_dev_uninit:
	ipoib_ib_dev_cleanup(dev);

out_tx_ring_cleanup:
	vfree(priv->tx_ring);

out_rx_ring_cleanup:
	kfree(priv->rx_ring);

out:
	return -ENOMEM;
}

void ipoib_dev_cleanup(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;
	LIST_HEAD(head);

	ASSERT_RTNL();

	ipoib_delete_debug_files(dev);

	/* Delete any child interfaces first */
	list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
		/* Stop GC on child */
		set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags);
		cancel_delayed_work(&cpriv->neigh_reap_task);
		unregister_netdevice_queue(cpriv->dev, &head);
	}
	unregister_netdevice_many(&head);

	/*
	 * Must be before ipoib_ib_dev_cleanup or we delete an in-use
	 * work queue.
	 */
	ipoib_neigh_hash_uninit(dev);
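	/* (Mirror of the ordering in ipoib_dev_init(): the neigh table
	 * must be torn down while the per-device wq still exists.) */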

	ipoib_ib_dev_cleanup(dev);

	kfree(priv->rx_ring);
	vfree(priv->tx_ring);

	priv->rx_ring = NULL;
	priv->tx_ring = NULL;
}

static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	return ib_set_vf_link_state(priv->ca, vf, priv->port, link_state);
}

static int ipoib_get_vf_config(struct net_device *dev, int vf,
			       struct ifla_vf_info *ivf)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	int err;

	err = ib_get_vf_config(priv->ca, vf, priv->port, ivf);
	if (err)
		return err;

	ivf->vf = vf;

	return 0;
}

static int ipoib_set_vf_guid(struct net_device *dev, int vf, u64 guid, int type)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	if (type != IFLA_VF_IB_NODE_GUID && type != IFLA_VF_IB_PORT_GUID)
		return -EINVAL;

	return ib_set_vf_guid(priv->ca, vf, priv->port, guid, type);
}

static int ipoib_get_vf_stats(struct net_device *dev, int vf,
			      struct ifla_vf_stats *vf_stats)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	return ib_get_vf_stats(priv->ca, vf, priv->port, vf_stats);
}

static const struct header_ops ipoib_header_ops = {
	.create	= ipoib_hard_header,
};

static const struct net_device_ops ipoib_netdev_ops_pf = {
	.ndo_uninit		 = ipoib_uninit,
	.ndo_open		 = ipoib_open,
	.ndo_stop		 = ipoib_stop,
	.ndo_change_mtu		 = ipoib_change_mtu,
	.ndo_fix_features	 = ipoib_fix_features,
	.ndo_start_xmit		 = ipoib_start_xmit,
	.ndo_tx_timeout		 = ipoib_timeout,
	.ndo_set_rx_mode	 = ipoib_set_mcast_list,
	.ndo_get_iflink		 = ipoib_get_iflink,
	.ndo_set_vf_link_state	 = ipoib_set_vf_link_state,
	.ndo_get_vf_config	 = ipoib_get_vf_config,
	.ndo_get_vf_stats	 = ipoib_get_vf_stats,
	.ndo_set_vf_guid	 = ipoib_set_vf_guid,
	.ndo_set_mac_address	 = ipoib_set_mac,
};

static const struct net_device_ops ipoib_netdev_ops_vf = {
	.ndo_uninit		 = ipoib_uninit,
	.ndo_open		 = ipoib_open,
	.ndo_stop		 = ipoib_stop,
	.ndo_change_mtu		 = ipoib_change_mtu,
	.ndo_fix_features	 = ipoib_fix_features,
	.ndo_start_xmit		 = ipoib_start_xmit,
	.ndo_tx_timeout		 = ipoib_timeout,
	.ndo_set_rx_mode	 = ipoib_set_mcast_list,
	.ndo_get_iflink		 = ipoib_get_iflink,
};
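
/* The VF flavour omits the ndo_set_vf_* and ndo_set_mac_address hooks;
 * managing virtual functions and rewriting the port GUID only make
 * sense on the physical function. */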

void ipoib_setup(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	if (priv->hca_caps & IB_DEVICE_VIRTUAL_FUNCTION)
		dev->netdev_ops	= &ipoib_netdev_ops_vf;
	else
		dev->netdev_ops	= &ipoib_netdev_ops_pf;

	dev->header_ops		 = &ipoib_header_ops;

	ipoib_set_ethtool_ops(dev);

	netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT);

	dev->watchdog_timeo	 = HZ;

	dev->flags		|= IFF_BROADCAST | IFF_MULTICAST;

	dev->hard_header_len	 = IPOIB_HARD_LEN;
	dev->addr_len		 = INFINIBAND_ALEN;
	dev->type		 = ARPHRD_INFINIBAND;
	dev->tx_queue_len	 = ipoib_sendq_size * 2;
	dev->features		 = (NETIF_F_VLAN_CHALLENGED |
				    NETIF_F_HIGHDMA);
	netif_keep_dst(dev);

	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);

	priv->dev = dev;

	spin_lock_init(&priv->lock);

	init_rwsem(&priv->vlan_rwsem);

	INIT_LIST_HEAD(&priv->path_list);
	INIT_LIST_HEAD(&priv->child_intfs);
	INIT_LIST_HEAD(&priv->dead_ahs);
	INIT_LIST_HEAD(&priv->multicast_list);

	INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task);
	INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
	INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light);
	INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal);
	INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy);
	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
	INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
}

struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
{
	struct net_device *dev;

	dev = alloc_netdev((int)sizeof(struct ipoib_dev_priv), name,
			   NET_NAME_UNKNOWN, ipoib_setup);
	if (!dev)
		return NULL;

	return netdev_priv(dev);
}

static ssize_t show_pkey(struct device *dev,
			 struct device_attribute *attr, char *buf)
{
	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));

	return sprintf(buf, "0x%04x\n", priv->pkey);
}
static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);

static ssize_t show_umcast(struct device *dev,
			   struct device_attribute *attr, char *buf)
{
	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));

	return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
}

void ipoib_set_umcast(struct net_device *ndev, int umcast_val)
{
	struct ipoib_dev_priv *priv = netdev_priv(ndev);

	if (umcast_val > 0) {
		set_bit(IPOIB_FLAG_UMCAST, &priv->flags);
		ipoib_warn(priv, "ignoring multicast groups joined directly by userspace\n");
	} else
		clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);
}

static ssize_t set_umcast(struct device *dev,
			  struct device_attribute *attr,
			  const char *buf, size_t count)
{
	unsigned long umcast_val = simple_strtoul(buf, NULL, 0);

	ipoib_set_umcast(to_net_dev(dev), umcast_val);

	return count;
}
static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast);

int ipoib_add_umcast_attr(struct net_device *dev)
{
	return device_create_file(&dev->dev, &dev_attr_umcast);
}

static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid)
{
	struct ipoib_dev_priv *child_priv;
	struct net_device *netdev = priv->dev;

	netif_addr_lock_bh(netdev);

	memcpy(&priv->local_gid.global.interface_id,
	       &gid->global.interface_id,
	       sizeof(gid->global.interface_id));
	memcpy(netdev->dev_addr + 4, &priv->local_gid, sizeof(priv->local_gid));
	clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);

	netif_addr_unlock_bh(netdev);

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		down_read(&priv->vlan_rwsem);
		list_for_each_entry(child_priv, &priv->child_intfs, list)
			set_base_guid(child_priv, gid);
		up_read(&priv->vlan_rwsem);
	}
}
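
/*
 * An IPoIB hardware address is 20 bytes: a 4 byte QPN/flags field
 * followed by the 16 byte port GID (8 byte subnet prefix plus 8 byte
 * interface ID), which is why the GID lives at offset 4 of dev_addr
 * above and of ss->__data below.
 */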
static int ipoib_check_lladdr(struct net_device *dev,
			      struct sockaddr_storage *ss)
{
	union ib_gid *gid = (union ib_gid *)(ss->__data + 4);
	int ret = 0;

	netif_addr_lock_bh(dev);

	/* Make sure the QPN, reserved fields and subnet prefix match the
	 * current lladdr; this also ensures that the lladdr is unicast.
	 */
	if (memcmp(dev->dev_addr, ss->__data,
		   4 + sizeof(gid->global.subnet_prefix)) ||
	    gid->global.interface_id == 0)
		ret = -EINVAL;

	netif_addr_unlock_bh(dev);

	return ret;
}

static int ipoib_set_mac(struct net_device *dev, void *addr)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct sockaddr_storage *ss = addr;
	int ret;

	if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev))
		return -EBUSY;

	ret = ipoib_check_lladdr(dev, ss);
	if (ret)
		return ret;

	set_base_guid(priv, (union ib_gid *)(ss->__data + 4));

	queue_work(ipoib_workqueue, &priv->flush_light);

	return 0;
}

static ssize_t create_child(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf, size_t count)
{
	int pkey;
	int ret;

	if (sscanf(buf, "%i", &pkey) != 1)
		return -EINVAL;

	if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000)
		return -EINVAL;

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
	 */
	pkey |= 0x8000;

	ret = ipoib_vlan_add(to_net_dev(dev), pkey);

	return ret ? ret : count;
}
static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child);
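
/* For example (sysfs path assumed, not taken from this file):
 *   echo 0x0001 > /sys/class/net/ib0/create_child
 * creates the same child as writing 0x8001 would, since bit 15 is the
 * membership bit and only the low 15 bits select the partition. */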

static ssize_t delete_child(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf, size_t count)
{
	int pkey;
	int ret;

	if (sscanf(buf, "%i", &pkey) != 1)
		return -EINVAL;

	if (pkey < 0 || pkey > 0xffff)
		return -EINVAL;

	ret = ipoib_vlan_delete(to_net_dev(dev), pkey);

	return ret ? ret : count;
}
static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child);

int ipoib_add_pkey_attr(struct net_device *dev)
{
	return device_create_file(&dev->dev, &dev_attr_pkey);
}

void ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
{
	priv->hca_caps = hca->attrs.device_cap_flags;

	if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
		priv->dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_RXCSUM;

		if (priv->hca_caps & IB_DEVICE_UD_TSO)
			priv->dev->hw_features |= NETIF_F_TSO;

		priv->dev->features |= priv->dev->hw_features;
	}
}

static struct net_device *ipoib_add_port(const char *format,
					 struct ib_device *hca, u8 port)
{
	struct ipoib_dev_priv *priv;
	struct ib_port_attr attr;
	int result = -ENOMEM;

	priv = ipoib_intf_alloc(format);
	if (!priv)
		goto alloc_mem_failed;

	SET_NETDEV_DEV(priv->dev, hca->dev.parent);
	priv->dev->dev_id = port - 1;

	result = ib_query_port(hca, port, &attr);
	if (!result)
		priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
	else {
		printk(KERN_WARNING "%s: ib_query_port %d failed\n",
		       hca->name, port);
		goto device_init_failed;
	}

	/* MTU will be reset when mcast join happens */
	priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
	priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu;
	priv->dev->max_mtu = IPOIB_CM_MTU;
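	/* e.g. a 2048 byte IB MTU gives a 2044 byte datagram MTU, the
	 * difference being the 4 byte IPoIB encapsulation header;
	 * max_mtu is capped separately at the connected-mode limit. */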

	priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh);

	result = ib_query_pkey(hca, port, 0, &priv->pkey);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}

	ipoib_set_dev_features(priv, hca);

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
	 */
	priv->pkey |= 0x8000;

	priv->dev->broadcast[8] = priv->pkey >> 8;
	priv->dev->broadcast[9] = priv->pkey & 0xff;
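	/* Bytes 8 and 9 of the broadcast address form the pkey field of
	 * the broadcast MGID; e.g. a limited-membership default pkey of
	 * 0x7fff becomes 0xffff here. */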

	result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	} else
		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof(union ib_gid));
	set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);

	result = ipoib_dev_init(priv->dev, hca, port);
	if (result < 0) {
		printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}

	INIT_IB_EVENT_HANDLER(&priv->event_handler,
			      priv->ca, ipoib_event);
	result = ib_register_event_handler(&priv->event_handler);
	if (result < 0) {
		printk(KERN_WARNING "%s: ib_register_event_handler failed for port %d (ret = %d)\n",
		       hca->name, port, result);
		goto event_failed;
	}

	result = register_netdev(priv->dev);
	if (result) {
		printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
		       hca->name, port, result);
		goto register_failed;
	}

	ipoib_create_debug_files(priv->dev);

	if (ipoib_cm_add_mode_attr(priv->dev))
		goto sysfs_failed;
	if (ipoib_add_pkey_attr(priv->dev))
		goto sysfs_failed;
	if (ipoib_add_umcast_attr(priv->dev))
		goto sysfs_failed;
	if (device_create_file(&priv->dev->dev, &dev_attr_create_child))
		goto sysfs_failed;
	if (device_create_file(&priv->dev->dev, &dev_attr_delete_child))
		goto sysfs_failed;

	return priv->dev;

sysfs_failed:
	ipoib_delete_debug_files(priv->dev);
	unregister_netdev(priv->dev);

register_failed:
	ib_unregister_event_handler(&priv->event_handler);
	flush_workqueue(ipoib_workqueue);
	/* Stop GC if started before flush */
	set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	cancel_delayed_work(&priv->neigh_reap_task);
	flush_workqueue(priv->wq);

event_failed:
	ipoib_dev_cleanup(priv->dev);

device_init_failed:
	free_netdev(priv->dev);

alloc_mem_failed:
	return ERR_PTR(result);
}

static void ipoib_add_one(struct ib_device *device)
{
	struct list_head *dev_list;
	struct net_device *dev;
	struct ipoib_dev_priv *priv;
	int p;
	int count = 0;

	dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
	if (!dev_list)
		return;

	INIT_LIST_HEAD(dev_list);

	for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
		if (!rdma_protocol_ib(device, p))
			continue;
		dev = ipoib_add_port("ib%d", device, p);
		if (!IS_ERR(dev)) {
			priv = netdev_priv(dev);
			list_add_tail(&priv->list, dev_list);
			count++;
		}
	}

	if (!count) {
		kfree(dev_list);
		return;
	}

	ib_set_client_data(device, &ipoib_client, dev_list);
}

static void ipoib_remove_one(struct ib_device *device, void *client_data)
{
	struct ipoib_dev_priv *priv, *tmp;
	struct list_head *dev_list = client_data;

	if (!dev_list)
		return;

	list_for_each_entry_safe(priv, tmp, dev_list, list) {
		ib_unregister_event_handler(&priv->event_handler);
		flush_workqueue(ipoib_workqueue);

		/* mark interface in the middle of destruction */
		set_bit(IPOIB_FLAG_GOING_DOWN, &priv->flags);

		rtnl_lock();
		dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
		rtnl_unlock();

		/* Stop GC */
		set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
		cancel_delayed_work(&priv->neigh_reap_task);
		flush_workqueue(priv->wq);

		unregister_netdev(priv->dev);
		free_netdev(priv->dev);
	}

	kfree(dev_list);
}

static int __init ipoib_init_module(void)
{
	int ret;

	ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
	ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);

	ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
	ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE,
				IPOIB_MIN_QUEUE_SIZE);
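	/* e.g. a module parameter of 1000 is rounded up to 1024 and then
	 * clamped into [IPOIB_MIN_QUEUE_SIZE, IPOIB_MAX_QUEUE_SIZE]; the
	 * send queue must also be at least 2 * MAX_SEND_CQE deep. */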

#ifdef CONFIG_INFINIBAND_IPOIB_CM
	ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
#endif

	/*
	 * When copying small received packets, we only copy from the
	 * linear data part of the SKB, so we rely on this condition.
	 */
	BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);
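	/* i.e. any packet short enough to take the copy path (at most
	 * IPOIB_CM_COPYBREAK bytes) is guaranteed to fit entirely inside
	 * the skb's linear head of IPOIB_CM_HEAD_SIZE bytes. */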

	ret = ipoib_register_debugfs();
	if (ret)
		return ret;

	/*
	 * We create a global workqueue here that is used for all flush
	 * operations.  However, if you attempt to flush a workqueue
	 * from a task on that same workqueue, it deadlocks the system.
	 * We want to be able to flush the tasks associated with a
	 * specific net device, so we also create a workqueue for each
	 * netdevice.  We queue up the tasks for that device only on
	 * its private workqueue, and we only queue up flush events
	 * on our global flush workqueue.  This avoids the deadlocks.
	 */
	ipoib_workqueue = alloc_ordered_workqueue("ipoib_flush",
						  WQ_MEM_RECLAIM);
	if (!ipoib_workqueue) {
		ret = -ENOMEM;
		goto err_fs;
	}
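
	/* Concretely: the flush_light/normal/heavy works run on
	 * ipoib_workqueue (see e.g. ipoib_set_mac() above), while
	 * per-device tasks such as neigh_reap_task run on priv->wq,
	 * which a flush task can then flush without deadlocking. */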

	ib_sa_register_client(&ipoib_sa_client);

	ret = ib_register_client(&ipoib_client);
	if (ret)
		goto err_sa;

	ret = ipoib_netlink_init();
	if (ret)
		goto err_client;

	return 0;

err_client:
	ib_unregister_client(&ipoib_client);

err_sa:
	ib_sa_unregister_client(&ipoib_sa_client);
	destroy_workqueue(ipoib_workqueue);

err_fs:
	ipoib_unregister_debugfs();

	return ret;
}

static void __exit ipoib_cleanup_module(void)
{
	ipoib_netlink_fini();
	ib_unregister_client(&ipoib_client);
	ib_sa_unregister_client(&ipoib_sa_client);
	ipoib_unregister_debugfs();
	destroy_workqueue(ipoib_workqueue);
}

module_init(ipoib_init_module);
module_exit(ipoib_cleanup_module);