Skip to content

Commit 753249a

Browse files
committed
Do not gard cores on the initial core wakeup failure
We have seen rare (but non-zero) errors during slave core wakeup where we never see the new core reporting in. Currently this will result in a visible log and a core gard. However, there is currently no indication this failure is actually due to bad hardware. As a workaround, this commit adds an indicator that keeps track of whether a core has failed wakeup previously. The first time we encounter the error, there will be a visible log with a FW callout and no deconfig or gard of the core. That will trigger a boot failure and a reboot. If we don't fail on the next boot (which is expected), the counter will be cleared. If we do fail again, there will be a visible log (with a new SRC) that calls out the core as the primary cause, plus does a deconfig+gard. Change-Id: I3a25537cf9c9c8e0b679519b67e9ae4e3492736d Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/70993 Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com> Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
1 parent 10dbf93 commit 753249a

File tree

4 files changed

+130
-8
lines changed

4 files changed

+130
-8
lines changed

src/include/usr/isteps/istep_reasoncodes.H

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,21 @@ namespace ISTEP
118118
RC_FAILED_TO_BOOT_SBE = ISTEP_COMP_ID | 0x38,
119119
RC_REDISCOVERED_TARGETS = ISTEP_COMP_ID | 0x39,
120120
RC_P9N_LESS_THAN_DD22_NOT_SUPPORTED = ISTEP_COMP_ID | 0x3A,
121-
RC_PNOR_IPMI_NOT_ENABLED = ISTEP_COMP_ID | 0x3B,
121+
RC_FREQ_ATTR_TIMER_EXPIRED = ISTEP_COMP_ID | 0x40,
122+
RC_FREQ_ATTR_TIMER_THREAD_FAIL = ISTEP_COMP_ID | 0x41,
123+
RC_FLOOR_FREQ_MISMATCH = ISTEP_COMP_ID | 0x42,
124+
RC_CEIL_FREQ_MISMATCH = ISTEP_COMP_ID | 0x43,
125+
RC_TURBO_FREQ_MISMATCH = ISTEP_COMP_ID | 0x44,
126+
RC_ULTRA_TURBO_FREQ_MISMATCH = ISTEP_COMP_ID | 0x45,
127+
RC_NEST_FREQ_MISMATCH = ISTEP_COMP_ID | 0x46,
128+
RC_NO_VALID_MEM_CONFIG = ISTEP_COMP_ID | 0x47,
129+
RC_MASTER_GET_SBE_BOOT_SEEPROM_FAIL = ISTEP_COMP_ID | 0x48,
130+
RC_SLAVE_GET_SBE_BOOT_SEEPROM_FAIL = ISTEP_COMP_ID | 0x49,
131+
RC_LINK_TRAIN_ERRORS_FROM_HWP = ISTEP_COMP_ID | 0x4A,
132+
RC_RISK_LEVEL_TOO_LOW = ISTEP_COMP_ID | 0x4B,
133+
RC_INVALID_HX_KEYWORD_DATA = ISTEP_COMP_ID | 0x4C,
134+
RC_PNOR_IPMI_NOT_ENABLED = ISTEP_COMP_ID | 0x4D,
135+
RC_SLAVE_CORE_WAKEUP_ERROR = ISTEP_COMP_ID | 0x4E,
122136
};
123137

124138
};

src/usr/isteps/istep16/call_host_activate_slave_cores.C

Lines changed: 92 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -179,8 +179,13 @@ void* call_host_activate_slave_cores (void *io_pArgs)
179179
}
180180
} // End of handle time out error
181181

182-
// Create error log
183-
if (0 != rc)
182+
// Check if this core failed last time
183+
ATTR_PREVIOUS_WAKEUP_FAIL_type l_prevFail =
184+
(*l_core)->getAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>();
185+
186+
// Create predictive error log if this is the first failure
187+
// AND the HWP didn't see a problem
188+
if( (0 != rc) && (l_prevFail == 0) && (l_checkidle_eid == 0) )
184189
{
185190
TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
186191
"call_host_activate_slave_cores: "
@@ -208,11 +213,75 @@ void* call_host_activate_slave_cores (void *io_pArgs)
208213
l_checkidle_eid,
209214
rc) );
210215

216+
// Going to assume some kind of SW error unless it fails
217+
// again
218+
l_errl->addProcedureCallout( HWAS::EPUB_PRC_HB_CODE,
219+
HWAS::SRCI_PRIORITY_HIGH);
220+
211221
// Callout core that failed to wake up.
212222
l_errl->addHwCallout(*l_core,
213-
HWAS::SRCI_PRIORITY_MED,
214-
HWAS::DECONFIG,
215-
HWAS::GARD_Predictive);
223+
HWAS::SRCI_PRIORITY_LOW,
224+
HWAS::NO_DECONFIG,
225+
HWAS::GARD_NULL);
226+
227+
// Could be an interrupt issue
228+
l_errl->collectTrace(INTR_TRACE_NAME,256);
229+
230+
// Throw printk in there too in case it is a kernel issue
231+
ERRORLOG::ErrlUserDetailsPrintk().addToLog(l_errl);
232+
233+
// Add interesting ISTEP traces
234+
l_errl->collectTrace(ISTEP_COMP_NAME,256);
235+
236+
l_stepError.addErrorDetails( l_errl );
237+
errlCommit( l_errl, HWPF_COMP_ID );
238+
239+
// Remember that we failed so we can gard the core if it
240+
// happens again on the reboot
241+
l_prevFail = 1;
242+
(*l_core)->
243+
setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
244+
245+
break;
246+
}
247+
// Create unrecoverable error log if this is a repeat
248+
// OR if the HWP hit something
249+
else if( (0 != rc) &&
250+
((l_prevFail > 0) || (l_checkidle_eid != 0)) )
251+
{
252+
TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
253+
"call_host_activate_slave_cores: "
254+
"Core errors during wakeup on core %x",
255+
pir);
256+
/*@
257+
* @errortype
258+
* @reasoncode RC_SLAVE_CORE_WAKEUP_ERROR
259+
* @severity ERRORLOG::ERRL_SEV_UNRECOVERABLE
260+
* @moduleid MOD_HOST_ACTIVATE_SLAVE_CORES
261+
* @userdata1[00:31] PIR of failing core.
262+
* @userdata1[32:63] Number of previous failures.
263+
* @userdata2[00:31] EID from p9_check_idle_stop_done().
264+
* @userdata2[32:63] rc of cpu_start_core().
265+
*
266+
* @devdesc Kernel returned error when trying to activate
267+
* core.
268+
*/
269+
l_errl = new ERRORLOG::ErrlEntry(
270+
ERRORLOG::ERRL_SEV_UNRECOVERABLE,
271+
MOD_HOST_ACTIVATE_SLAVE_CORES,
272+
RC_SLAVE_CORE_WAKEUP_ERROR,
273+
TWO_UINT32_TO_UINT64(
274+
pir,
275+
l_prevFail),
276+
TWO_UINT32_TO_UINT64(
277+
l_checkidle_eid,
278+
rc) );
279+
280+
// Callout and gard core that failed to wake up.
281+
l_errl->addHwCallout(*l_core,
282+
HWAS::SRCI_PRIORITY_HIGH,
283+
HWAS::DECONFIG,
284+
HWAS::GARD_Predictive);
216285

217286
// Could be an interrupt issue
218287
l_errl->collectTrace(INTR_TRACE_NAME,256);
@@ -225,8 +294,26 @@ void* call_host_activate_slave_cores (void *io_pArgs)
225294

226295
l_stepError.addErrorDetails( l_errl );
227296
errlCommit( l_errl, HWPF_COMP_ID );
297+
298+
// We garded the core so we should zero out the fail
299+
// counter so the replacement doesn't get blamed
300+
l_prevFail = 0;
301+
(*l_core)->
302+
setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
303+
228304
break;
229305
}
306+
// Zero out the counter if we passed
307+
else if( l_prevFail > 0 )
308+
{
309+
TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
310+
"call_host_activate_slave_cores: "
311+
"Resetting failure count for core %.8X",
312+
TARGETING::get_huid(*l_core) );
313+
l_prevFail = 0;
314+
(*l_core)->
315+
setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
316+
}
230317
}
231318
}
232319
// @@@@@ END CUSTOM BLOCK: @@@@@

src/usr/targeting/common/xmltohb/attribute_types.xml

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<!-- -->
66
<!-- OpenPOWER HostBoot Project -->
77
<!-- -->
8-
<!-- Contributors Listed Below - COPYRIGHT 2012,2018 -->
8+
<!-- Contributors Listed Below - COPYRIGHT 2012,2019 -->
99
<!-- [+] International Business Machines Corp. -->
1010
<!-- -->
1111
<!-- -->
@@ -6755,6 +6755,24 @@ Selects which voltage level to place the Core and ECO domain PFETs upon Winkle e
67556755
<writeable/>
67566756
</attribute>
67576757

6758+
<attribute>
6759+
<description>
6760+
Tracks if a specific core has previously experienced a timeout during
6761+
initial activation.
6762+
0 = No previous errors reported;
6763+
1 = Core failed on the last attempt to be started
6764+
</description>
6765+
<id>PREVIOUS_WAKEUP_FAIL</id>
6766+
<persistency>non-volatile</persistency>
6767+
<readable/>
6768+
<writeable/>
6769+
<simpleType>
6770+
<uint8_t>
6771+
<default>0</default>
6772+
</uint8_t>
6773+
</simpleType>
6774+
<no_export/>
6775+
</attribute>
67586776

67596777
<attribute>
67606778
<id>SLOT_NAME</id>

src/usr/targeting/common/xmltohb/target_types.xml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<!-- -->
66
<!-- OpenPOWER HostBoot Project -->
77
<!-- -->
8-
<!-- Contributors Listed Below - COPYRIGHT 2012,2018 -->
8+
<!-- Contributors Listed Below - COPYRIGHT 2012,2019 -->
99
<!-- [+] Google Inc. -->
1010
<!-- [+] International Business Machines Corp. -->
1111
<!-- -->
@@ -1316,6 +1316,9 @@
13161316
<default>CPU</default>
13171317
<id>CDM_DOMAIN</id>
13181318
</attribute>
1319+
<attribute>
1320+
<id>PREVIOUS_WAKEUP_FAIL</id>
1321+
</attribute>
13191322
</targetType>
13201323

13211324
<!-- MCS

0 commit comments

Comments
 (0)