bte_error.c
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (c) 2000-2007 Silicon Graphics, Inc.  All Rights Reserved.
 */

#include <linux/types.h>
#include <asm/sn/sn_sal.h>
#include "ioerror.h"
#include <asm/sn/addrs.h>
#include <asm/sn/shubio.h>
#include <asm/sn/geo.h>
#include "xtalk/xwidgetdev.h"
#include "xtalk/hubdev.h"
#include <asm/sn/bte.h>
#include <asm/param.h>
/*
 * BTE error handling is done in two parts.  The first captures
 * any CRB-related errors.  Since there can be multiple CRBs per
 * interface and multiple interfaces active, we need to wait until
 * all active CRBs are completed.  This is the first job of the
 * second-part error handler.  When all BTE-related CRBs are cleanly
 * completed, it resets the interfaces and gets them ready for new
 * transfers to be queued.
 */

/*
 * Wait until all BTE related CRBs are completed
 * and then reset the interfaces.
 */
/*
 * Returns 0 once both SHUB1 BTE interfaces have been re-enabled and
 * soft-reset (recovery timer cancelled).  Returns 1 when cleanup cannot
 * proceed yet: either neither interface recorded an error, or BTE CRBs
 * are still marked/active — in the latter cases the recovery timer is
 * re-armed for another attempt in 5 seconds.
 */
static int shub1_bte_error_handler(struct nodepda_s *err_nodepda)
{
	/* Timer used to re-try recovery while CRBs are still in flight. */
	struct timer_list *recovery_timer = &err_nodepda->bte_recovery_timer;
	nasid_t nasid;		/* NASID of the hub whose BTEs we reset */
	int i;
	int valid_crbs;
	ii_imem_u_t imem;	/* II IMEM Register */
	ii_icrb0_d_u_t icrbd;	/* II CRB Register D */
	ii_ibcr_u_t ibcr;	/* written to soft-reset the BTE state machines */
	ii_icmr_u_t icmr;	/* CRB mark/valid status */
	ii_ieclr_u_t ieclr;	/* written to clear BTE0/1 error bits */

	BTE_PRINTK(("shub1_bte_error_handler(%p) - %d\n", err_nodepda,
		    smp_processor_id()));

	/* Nothing to recover if neither interface has a recorded error. */
	if ((err_nodepda->bte_if[0].bh_error == BTE_SUCCESS) &&
	    (err_nodepda->bte_if[1].bh_error == BTE_SUCCESS)) {
		BTE_PRINTK(("eh:%p:%d Nothing to do.\n", err_nodepda,
			    smp_processor_id()));
		return 1;
	}

	/* Determine information about our hub */
	nasid = cnodeid_to_nasid(err_nodepda->bte_if[0].bte_cnode);

	/*
	 * A BTE transfer can use multiple CRBs.  We need to make sure
	 * that all the BTE CRBs are complete (or timed out) before
	 * attempting to clean up the error.  Resetting the BTE while
	 * there are still BTE CRBs active will hang the BTE.
	 * We should look at all the CRBs to see if they are allocated
	 * to the BTE and see if they are still active.  When none
	 * are active, we can continue with the cleanup.
	 *
	 * We also want to make sure that the local NI port is up.
	 * When a router resets the NI port can go down, while it
	 * goes through the LLP handshake, but then comes back up.
	 */
	icmr.ii_icmr_regval = REMOTE_HUB_L(nasid, IIO_ICMR);
	if (icmr.ii_icmr_fld_s.i_crb_mark != 0) {
		/*
		 * There are errors which still need to be cleaned up by
		 * hubiio_crb_error_handler; defer and retry in 5 seconds.
		 */
		mod_timer(recovery_timer, jiffies + (HZ * 5));
		BTE_PRINTK(("eh:%p:%d Marked Giving up\n", err_nodepda,
			    smp_processor_id()));
		return 1;
	}
	if (icmr.ii_icmr_fld_s.i_crb_vld != 0) {

		valid_crbs = icmr.ii_icmr_fld_s.i_crb_vld;

		/* Scan every valid CRB for one still owned by a BTE op. */
		for (i = 0; i < IIO_NUM_CRBS; i++) {
			if (!((1 << i) & valid_crbs)) {
				/* This crb was not marked as valid, ignore */
				continue;
			}
			icrbd.ii_icrb0_d_regval =
			    REMOTE_HUB_L(nasid, IIO_ICRB_D(i));
			if (icrbd.d_bteop) {
				/* BTE CRB still active — defer and retry. */
				mod_timer(recovery_timer, jiffies + (HZ * 5));
				BTE_PRINTK(("eh:%p:%d Valid %d, Giving up\n",
					    err_nodepda, smp_processor_id(),
					    i));
				return 1;
			}
		}
	}

	BTE_PRINTK(("eh:%p:%d Cleaning up\n", err_nodepda, smp_processor_id()));
	/* Re-enable both bte interfaces */
	imem.ii_imem_regval = REMOTE_HUB_L(nasid, IIO_IMEM);
	imem.ii_imem_fld_s.i_b0_esd = imem.ii_imem_fld_s.i_b1_esd = 1;
	REMOTE_HUB_S(nasid, IIO_IMEM, imem.ii_imem_regval);

	/* Clear BTE0/1 error bits — only for interfaces that saw an error. */
	ieclr.ii_ieclr_regval = 0;
	if (err_nodepda->bte_if[0].bh_error != BTE_SUCCESS)
		ieclr.ii_ieclr_fld_s.i_e_bte_0 = 1;
	if (err_nodepda->bte_if[1].bh_error != BTE_SUCCESS)
		ieclr.ii_ieclr_fld_s.i_e_bte_1 = 1;
	REMOTE_HUB_S(nasid, IIO_IECLR, ieclr.ii_ieclr_regval);

	/* Reinitialize both BTE state machines. */
	ibcr.ii_ibcr_regval = REMOTE_HUB_L(nasid, IIO_IBCR);
	ibcr.ii_ibcr_fld_s.i_soft_reset = 1;
	REMOTE_HUB_S(nasid, IIO_IBCR, ibcr.ii_ibcr_regval);

	/* Recovery complete; no further retries needed. */
	del_timer(recovery_timer);

	return 0;
}
/*
 * Wait until all BTE related transfers are completed
 * and then reset the interfaces (SHUB2 variant).
 */
  117. static int shub2_bte_error_handler(struct nodepda_s *err_nodepda)
  118. {
  119. struct timer_list *recovery_timer = &err_nodepda->bte_recovery_timer;
  120. struct bteinfo_s *bte;
  121. nasid_t nasid;
  122. u64 status;
  123. int i;
  124. nasid = cnodeid_to_nasid(err_nodepda->bte_if[0].bte_cnode);
  125. /*
  126. * Verify that all the BTEs are complete
  127. */
  128. for (i = 0; i < BTES_PER_NODE; i++) {
  129. bte = &err_nodepda->bte_if[i];
  130. status = BTE_LNSTAT_LOAD(bte);
  131. if (status & IBLS_ERROR) {
  132. bte->bh_error = BTE_SHUB2_ERROR(status);
  133. continue;
  134. }
  135. if (!(status & IBLS_BUSY))
  136. continue;
  137. mod_timer(recovery_timer, jiffies + (HZ * 5));
  138. BTE_PRINTK(("eh:%p:%d Marked Giving up\n", err_nodepda,
  139. smp_processor_id()));
  140. return 1;
  141. }
  142. if (ia64_sn_bte_recovery(nasid))
  143. panic("bte_error_handler(): Fatal BTE Error");
  144. del_timer(recovery_timer);
  145. return 0;
  146. }
/*
 * Second-part (node-level) error handler: lock all BTE interfaces on the
 * node, run the shub-revision-specific recovery, then notify waiters of
 * any recorded errors and release the interfaces.
 */
  151. void bte_error_handler(struct nodepda_s *err_nodepda)
  152. {
  153. spinlock_t *recovery_lock = &err_nodepda->bte_recovery_lock;
  154. int i;
  155. unsigned long irq_flags;
  156. volatile u64 *notify;
  157. bte_result_t bh_error;
  158. BTE_PRINTK(("bte_error_handler(%p) - %d\n", err_nodepda,
  159. smp_processor_id()));
  160. spin_lock_irqsave(recovery_lock, irq_flags);
  161. /*
  162. * Lock all interfaces on this node to prevent new transfers
  163. * from being queued.
  164. */
  165. for (i = 0; i < BTES_PER_NODE; i++) {
  166. if (err_nodepda->bte_if[i].cleanup_active) {
  167. continue;
  168. }
  169. spin_lock(&err_nodepda->bte_if[i].spinlock);
  170. BTE_PRINTK(("eh:%p:%d locked %d\n", err_nodepda,
  171. smp_processor_id(), i));
  172. err_nodepda->bte_if[i].cleanup_active = 1;
  173. }
  174. if (is_shub1()) {
  175. if (shub1_bte_error_handler(err_nodepda)) {
  176. spin_unlock_irqrestore(recovery_lock, irq_flags);
  177. return;
  178. }
  179. } else {
  180. if (shub2_bte_error_handler(err_nodepda)) {
  181. spin_unlock_irqrestore(recovery_lock, irq_flags);
  182. return;
  183. }
  184. }
  185. for (i = 0; i < BTES_PER_NODE; i++) {
  186. bh_error = err_nodepda->bte_if[i].bh_error;
  187. if (bh_error != BTE_SUCCESS) {
  188. /* There is an error which needs to be notified */
  189. notify = err_nodepda->bte_if[i].most_rcnt_na;
  190. BTE_PRINTK(("cnode %d bte %d error=0x%lx\n",
  191. err_nodepda->bte_if[i].bte_cnode,
  192. err_nodepda->bte_if[i].bte_num,
  193. IBLS_ERROR | (u64) bh_error));
  194. *notify = IBLS_ERROR | bh_error;
  195. err_nodepda->bte_if[i].bh_error = BTE_SUCCESS;
  196. }
  197. err_nodepda->bte_if[i].cleanup_active = 0;
  198. BTE_PRINTK(("eh:%p:%d Unlocked %d\n", err_nodepda,
  199. smp_processor_id(), i));
  200. spin_unlock(&err_nodepda->bte_if[i].spinlock);
  201. }
  202. spin_unlock_irqrestore(recovery_lock, irq_flags);
  203. }
/*
 * First-part error handler.  Called whenever any error CRB interrupt
 * is generated by the II.
 */
  208. void
  209. bte_crb_error_handler(cnodeid_t cnode, int btenum,
  210. int crbnum, ioerror_t * ioe, int bteop)
  211. {
  212. struct bteinfo_s *bte;
  213. bte = &(NODEPDA(cnode)->bte_if[btenum]);
  214. /*
  215. * The caller has already figured out the error type, we save that
  216. * in the bte handle structure for the thread exercising the
  217. * interface to consume.
  218. */
  219. bte->bh_error = ioe->ie_errortype + BTEFAIL_OFFSET;
  220. bte->bte_error_count++;
  221. BTE_PRINTK(("Got an error on cnode %d bte %d: HW error type 0x%x\n",
  222. bte->bte_cnode, bte->bte_num, ioe->ie_errortype));
  223. bte_error_handler(NODEPDA(cnode));
  224. }