Do not gard cores on the initial core wakeup failure

dcrowell77 · dcrowell77 · commit 753249ad9bd9 · 2019-01-29T16:01:39.000-06:00
We have seen rare (but non-zero) errors during slave core wakeup where we never see the new core reporting in. Currently this will result in a visible log and a core gard. However, there is currently no indication this failure is actually due to bad hardware. As a workaround, this commit adds an indicator that tracks whether a core has previously failed wakeup. The first time we encounter the error there will be a visible log with a FW callout and no deconfig or gard of the core. That will trigger a boot failure and a reboot. If we don't fail on the next boot (which is expected), the counter will be cleared. If we do fail again there will be a visible log (with a new SRC) that calls out the core as the primary cause, plus does a deconfig+gard. Change-Id: I3a25537cf9c9c8e0b679519b67e9ae4e3492736d Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/70993 Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com> Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
diff --git a/src/include/usr/isteps/istep_reasoncodes.H b/src/include/usr/isteps/istep_reasoncodes.H
@@ -118,7 +118,21 @@ namespace ISTEP
         RC_FAILED_TO_BOOT_SBE                    = ISTEP_COMP_ID | 0x38,
         RC_REDISCOVERED_TARGETS                  = ISTEP_COMP_ID | 0x39,
         RC_P9N_LESS_THAN_DD22_NOT_SUPPORTED      = ISTEP_COMP_ID | 0x3A,
-        RC_PNOR_IPMI_NOT_ENABLED                 = ISTEP_COMP_ID | 0x3B,
+        RC_FREQ_ATTR_TIMER_EXPIRED               = ISTEP_COMP_ID | 0x40,
+        RC_FREQ_ATTR_TIMER_THREAD_FAIL           = ISTEP_COMP_ID | 0x41,
+        RC_FLOOR_FREQ_MISMATCH                   = ISTEP_COMP_ID | 0x42,
+        RC_CEIL_FREQ_MISMATCH                    = ISTEP_COMP_ID | 0x43,
+        RC_TURBO_FREQ_MISMATCH                   = ISTEP_COMP_ID | 0x44,
+        RC_ULTRA_TURBO_FREQ_MISMATCH             = ISTEP_COMP_ID | 0x45,
+        RC_NEST_FREQ_MISMATCH                    = ISTEP_COMP_ID | 0x46,
+        RC_NO_VALID_MEM_CONFIG                   = ISTEP_COMP_ID | 0x47,
+        RC_MASTER_GET_SBE_BOOT_SEEPROM_FAIL      = ISTEP_COMP_ID | 0x48,
+        RC_SLAVE_GET_SBE_BOOT_SEEPROM_FAIL       = ISTEP_COMP_ID | 0x49,
+        RC_LINK_TRAIN_ERRORS_FROM_HWP            = ISTEP_COMP_ID | 0x4A,
+        RC_RISK_LEVEL_TOO_LOW                    = ISTEP_COMP_ID | 0x4B,
+        RC_INVALID_HX_KEYWORD_DATA               = ISTEP_COMP_ID | 0x4C,
+        RC_PNOR_IPMI_NOT_ENABLED                 = ISTEP_COMP_ID | 0x4D,
+        RC_SLAVE_CORE_WAKEUP_ERROR               = ISTEP_COMP_ID | 0x4E,
     };
 
 };
diff --git a/src/usr/isteps/istep16/call_host_activate_slave_cores.C b/src/usr/isteps/istep16/call_host_activate_slave_cores.C
@@ -179,8 +179,13 @@ void* call_host_activate_slave_cores (void *io_pArgs)
                 }
             } // End of handle time out error
 
-            // Create error log
-            if (0 != rc)
+            // Check if this core failed last time
+            ATTR_PREVIOUS_WAKEUP_FAIL_type l_prevFail =
+              (*l_core)->getAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>();
+
+            // Create predictive error log if this is the first failure
+            //   AND the HWP didn't see a problem
+            if( (0 != rc) && (l_prevFail == 0) && (l_checkidle_eid == 0) )
             {
                 TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
                         "call_host_activate_slave_cores: "
@@ -208,11 +213,75 @@ void* call_host_activate_slave_cores (void *io_pArgs)
                                  l_checkidle_eid,
                                  rc) );
 
+                // Going to assume some kind of SW error unless it fails
+                //  again
+                l_errl->addProcedureCallout( HWAS::EPUB_PRC_HB_CODE,
+                                             HWAS::SRCI_PRIORITY_HIGH);
+
                 // Callout core that failed to wake up.
                 l_errl->addHwCallout(*l_core,
-                        HWAS::SRCI_PRIORITY_MED,
-                        HWAS::DECONFIG,
-                        HWAS::GARD_Predictive);
+                        HWAS::SRCI_PRIORITY_LOW,
+                        HWAS::NO_DECONFIG,
+                        HWAS::GARD_NULL);
+
+                // Could be an interrupt issue
+                l_errl->collectTrace(INTR_TRACE_NAME,256);
+
+                // Throw printk in there too in case it is a kernel issue
+                ERRORLOG::ErrlUserDetailsPrintk().addToLog(l_errl);
+
+                // Add interesting ISTEP traces
+                l_errl->collectTrace(ISTEP_COMP_NAME,256);
+
+                l_stepError.addErrorDetails( l_errl );
+                errlCommit( l_errl, HWPF_COMP_ID );
+
+                // Remember that we failed so we can gard the core if it
+                //  happens again on the reboot
+                l_prevFail = 1;
+                (*l_core)->
+                  setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
+
+                break;
+            }
+            // Create unrecoverable error log if this is a repeat
+            //  OR if the HWP hit something
+            else if( (0 != rc) &&
+                     ((l_prevFail > 0) || (l_checkidle_eid != 0)) )
+            {
+                TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
+                           "call_host_activate_slave_cores: "
+                           "Core errors during wakeup on core %x",
+                           pir);
+                /*@
+                 * @errortype
+                 * @reasoncode  RC_SLAVE_CORE_WAKEUP_ERROR
+                 * @severity    ERRORLOG::ERRL_SEV_UNRECOVERABLE
+                 * @moduleid    MOD_HOST_ACTIVATE_SLAVE_CORES
+                 * @userdata1[00:31]   PIR of failing core.
+                 * @userdata1[32:63]   Number of previous failures.
+                 * @userdata2[00:31]   EID from p9_check_idle_stop_done().
+                 * @userdata2[32:63]   rc of cpu_start_core().
+                 *
+                 * @devdesc Kernel returned error when trying to activate
+                 *          core.
+                 */
+                l_errl = new ERRORLOG::ErrlEntry(
+                               ERRORLOG::ERRL_SEV_UNRECOVERABLE,
+                               MOD_HOST_ACTIVATE_SLAVE_CORES,
+                               RC_SLAVE_CORE_WAKEUP_ERROR,
+                               TWO_UINT32_TO_UINT64(
+                                   pir,
+                                   l_prevFail),
+                               TWO_UINT32_TO_UINT64(
+                                   l_checkidle_eid,
+                                   rc) );
+
+                // Callout and gard core that failed to wake up.
+                l_errl->addHwCallout(*l_core,
+                                     HWAS::SRCI_PRIORITY_HIGH,
+                                     HWAS::DECONFIG,
+                                     HWAS::GARD_Predictive);
 
                 // Could be an interrupt issue
                 l_errl->collectTrace(INTR_TRACE_NAME,256);
@@ -225,8 +294,26 @@ void* call_host_activate_slave_cores (void *io_pArgs)
 
                 l_stepError.addErrorDetails( l_errl );
                 errlCommit( l_errl, HWPF_COMP_ID );
+
+                // We garded the core so we should zero out the fail
+                //  counter so the replacement doesn't get blamed
+                l_prevFail = 0;
+                (*l_core)->
+                  setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
+
                 break;
             }
+            // Wakeup passed, so zero out any previously recorded failure
+            else if( l_prevFail > 0 )
+            {
+                TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
+                           "call_host_activate_slave_cores: "
+                           "Resetting failure count for core %.8X",
+                           TARGETING::get_huid(*l_core) );
+                l_prevFail = 0;
+                (*l_core)->
+                  setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
+            }
         }
     }
     // @@@@@    END CUSTOM BLOCK:   @@@@@
diff --git a/src/usr/targeting/common/xmltohb/attribute_types.xml b/src/usr/targeting/common/xmltohb/attribute_types.xml
@@ -5,7 +5,7 @@
 <!--                                                                        -->
 <!-- OpenPOWER HostBoot Project                                             -->
 <!--                                                                        -->
-<!-- Contributors Listed Below - COPYRIGHT 2012,2018                        -->
+<!-- Contributors Listed Below - COPYRIGHT 2012,2019                        -->
 <!-- [+] International Business Machines Corp.                              -->
 <!--                                                                        -->
 <!--                                                                        -->
@@ -6755,6 +6755,24 @@ Selects which voltage level to place the Core and ECO domain PFETs upon Winkle e
     <writeable/>
 </attribute>
 
+<attribute>
+  <description>
+     Tracks if a specific core has previously experienced a timeout during
+     initial activation.
+        0 = No previous errors reported
+        1 = Core failed on the last attempt to be started
+  </description>
+  <id>PREVIOUS_WAKEUP_FAIL</id>
+  <persistency>non-volatile</persistency>
+  <readable/>
+  <writeable/>
+  <simpleType>
+    <uint8_t>
+      <default>0</default>
+    </uint8_t>
+  </simpleType>
+  <no_export/>
+</attribute>
 
 <attribute>
     <id>SLOT_NAME</id>
diff --git a/src/usr/targeting/common/xmltohb/target_types.xml b/src/usr/targeting/common/xmltohb/target_types.xml
@@ -5,7 +5,7 @@
 <!--                                                                        -->
 <!-- OpenPOWER HostBoot Project                                             -->
 <!--                                                                        -->
-<!-- Contributors Listed Below - COPYRIGHT 2012,2018                        -->
+<!-- Contributors Listed Below - COPYRIGHT 2012,2019                        -->
 <!-- [+] Google Inc.                                                        -->
 <!-- [+] International Business Machines Corp.                              -->
 <!--                                                                        -->
@@ -1316,6 +1316,9 @@
     <default>CPU</default>
     <id>CDM_DOMAIN</id>
   </attribute>
+  <attribute>
+    <id>PREVIOUS_WAKEUP_FAIL</id>
+  </attribute>
 </targetType>
 
 <!-- MCS