Skip to content

Commit 39cba20

Browse files
committed
Force reboot without visible errors for core wakeup failure
The intermittent core wakeup failure continues to plague us with no solution in sight. Since the error is extremely rare (less than 1% of boots), we have decided to force a manual reboot and not log any visible errors to the customer.

Change-Id: Ic30f6330431bd2c8ce75075befc2c36d278d8152
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/71319
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
1 parent 8a977a1 commit 39cba20

File tree

1 file changed

+34
-9
lines changed

1 file changed

+34
-9
lines changed

src/usr/isteps/istep16/call_host_activate_slave_cores.C

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
#include <scom/scomif.H>
5252
#include <errl/errludprintk.H>
5353
#include <intr/intr_reasoncodes.H>
54+
#include <initservice/istepdispatcherif.H>
5455

5556
using namespace ERRORLOG;
5657
using namespace TARGETING;
@@ -84,6 +85,9 @@ void* call_host_activate_slave_cores (void *io_pArgs)
8485
assert( sys != NULL );
8586
uint32_t l_numCores = 0;
8687

88+
// keep track of which cores started
89+
TargetHandleList l_startedCores;
90+
8791
for(TargetHandleList::const_iterator
8892
l_core = l_cores.begin();
8993
l_core != l_cores.end();
@@ -233,7 +237,8 @@ void* call_host_activate_slave_cores (void *io_pArgs)
233237
// Add interesting ISTEP traces
234238
l_errl->collectTrace(ISTEP_COMP_NAME,256);
235239

236-
l_stepError.addErrorDetails( l_errl );
240+
// Choosing to ignore this intermittent error
241+
l_errl->setSev(ERRORLOG::ERRL_SEV_INFORMATIONAL);
237242
errlCommit( l_errl, HWPF_COMP_ID );
238243

239244
// Remember that we failed so we can gard the core if it
@@ -242,6 +247,14 @@ void* call_host_activate_slave_cores (void *io_pArgs)
242247
(*l_core)->
243248
setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
244249

250+
#ifdef CONFIG_BMC_IPMI
251+
// Initiate a graceful power cycle
252+
CONSOLE::displayf(ISTEP_COMP_NAME, "System Rebooting To Retry Recoverable Error");
253+
CONSOLE::flush();
254+
TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,"call_host_activate_slave_cores: requesting power cycle");
255+
INITSERVICE::requestReboot();
256+
#endif
257+
245258
break;
246259
}
247260
// Create unrecoverable error log if this is a repeat
@@ -306,17 +319,29 @@ void* call_host_activate_slave_cores (void *io_pArgs)
306319
// Zero out the counter if we passed
307320
else if( l_prevFail > 0 )
308321
{
309-
TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
310-
"call_host_activate_slave_cores: "
311-
"Resetting failure count for core %.8X",
312-
TARGETING::get_huid(*l_core) );
313-
l_prevFail = 0;
314-
(*l_core)->
315-
setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
322+
// Add to the list of passing cores so we can
323+
// clear ATTR_PREVIOUS_WAKEUP_FAIL later
324+
l_startedCores.push_back(*l_core);
316325
}
317326
}
318327
}
319-
// @@@@@ END CUSTOM BLOCK: @@@@@
328+
329+
// Clear out the wakeup_fail indicators only after every core has passed.
330+
// Doing this outside the loop helps mitigate the (unlikely) case where
331+
// a failure bounces between different cores on several consecutive boots.
332+
for(TargetHandleList::const_iterator
333+
l_core = l_startedCores.begin();
334+
l_core != l_startedCores.end();
335+
++l_core)
336+
{
337+
TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
338+
"call_host_activate_slave_cores: "
339+
"Resetting failure count for core %.8X",
340+
TARGETING::get_huid(*l_core) );
341+
ATTR_PREVIOUS_WAKEUP_FAIL_type l_prevFail = 0;
342+
(*l_core)->
343+
setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
344+
}
320345

321346
#if defined(CONFIG_IPLTIME_CHECKSTOP_ANALYSIS) && !defined(__HOSTBOOT_RUNTIME)
322347
if( l_stepError.isNull() )

0 commit comments

Comments
 (0)