Skip to content

Commit b616250

Browse files
cjcain authored and dcrowell77 committed
HTMGT: Change OCC logs to info while recovery is still being attempted
Change-Id: I0a46cacbc7e473dedd38ce9656ab25f5452c77c1
CQ: SW456777
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/72062
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Sheldon Bailey <baileysh@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
1 parent 4c0019e commit b616250

File tree

2 files changed

+107
-63
lines changed

2 files changed

+107
-63
lines changed

src/usr/htmgt/htmgt_occ.H

Lines changed: 14 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
/* */
66
/* OpenPOWER HostBoot Project */
77
/* */
8-
/* Contributors Listed Below - COPYRIGHT 2014,2018 */
8+
/* Contributors Listed Below - COPYRIGHT 2014,2019 */
99
/* [+] International Business Machines Corp. */
1010
/* */
1111
/* */
@@ -346,13 +346,19 @@ namespace HTMGT
346346
/**
347347
* @brief Determine what actions are required for elog
348348
*
349-
* @param[in] i_actions Action requested by OCC
350-
* @param[out] o_occReset returns true if OCC reset is needed
351-
* @param[out] o_errlSeverity severity to use for elog commit
352-
*/
353-
void elogProcessActions(const uint8_t i_actions,
354-
bool & o_occReset,
355-
ERRORLOG::errlSeverity_t & o_errlSeverity);
349+
* @param[in] i_actions Action flags requested by OCC
350+
* @param[in] i_src SRC being reported by OCC
351+
* @param[in] i_data Additional data used when
352+
* processing actions
353+
* @param[in,out] io_errlSeverity Severity to use for elog
354+
* @param[out] o_call_home True if info error should be
355+
* reported to BMC
356+
*/
357+
void elogProcessActions(const uint8_t i_actions,
358+
const uint32_t i_src,
359+
const uint32_t i_data,
360+
ERRORLOG::errlSeverity_t & io_errlSeverity,
361+
bool & o_call_home);
356362

357363

358364
/**

src/usr/htmgt/occError.C

Lines changed: 93 additions & 55 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
/* */
66
/* OpenPOWER HostBoot Project */
77
/* */
8-
/* Contributors Listed Below - COPYRIGHT 2014,2018 */
8+
/* Contributors Listed Below - COPYRIGHT 2014,2019 */
99
/* [+] International Business Machines Corp. */
1010
/* */
1111
/* */
@@ -109,7 +109,6 @@ namespace HTMGT
109109

110110
TMGT_BIN("OCC ELOG", l_occElog, 256);
111111

112-
113112
// Get user details section
114113
const occErrlUsrDtls_t *l_usrDtls_ptr = (occErrlUsrDtls_t *)
115114
((uint8_t*)l_occElog + sizeof(occErrlEntry_t));
@@ -118,6 +117,13 @@ namespace HTMGT
118117
ERRORLOG::errlSeverity_t severity =
119118
ERRORLOG::ERRL_SEV_INFORMATIONAL;
120119

120+
if (l_occSrc == 0x2A01)
121+
{
122+
// 2A01 is Periodic OCC Telemetry / Call Home data
123+
TMGT_INF("OCC is reporting Periodic Telemetry Data (0x2A01)"
124+
" - NOT AN ERROR");
125+
}
126+
121127
// Translate Severity
122128
const uint8_t l_occSeverity = l_occElog->severity;
123129
if (l_occSeverity < OCC_SEV_ACTION_XLATE_SIZE)
@@ -132,42 +138,12 @@ namespace HTMGT
132138
}
133139

134140
// Process Actions
135-
bool l_occReset = false;
136-
elogProcessActions(l_occElog->actions, l_occReset, severity);
137-
138-
139-
140-
// Need to add WOF reason code to OCC object regardless of
141-
// whether WOF resets are disabled.
142-
if( l_occElog->actions & TMGT_ERRL_ACTIONS_WOF_RESET_REQUIRED )
143-
{
144-
iv_wofResetReasons |= l_usrDtls_ptr->userData1;
145-
TMGT_ERR("WOF Reset Reasons for OCC%d = 0x%08x",
146-
iv_instance,
147-
iv_wofResetReasons);
148-
149-
}
150-
151-
// Check if we need a WOF requested reset
152-
if(iv_needsWofReset == true)
153-
{
154-
TMGT_ERR("WOF Reset detected! SRC = 0x%X",
155-
l_occSrc);
156-
157-
// We compare against one less than the threshold because
158-
// the WOF reset count doesn't get incremented until resetPrep
159-
if( iv_wofResetCount < (WOF_RESET_COUNT_THRESHOLD-1) )
160-
{
161-
// Not at WOF reset threshold yet. Set sev to INFO
162-
severity = ERRORLOG::ERRL_SEV_INFORMATIONAL;
163-
}
164-
}
165-
166-
if (l_occReset == true)
167-
{
168-
iv_needsReset = true;
169-
OccManager::updateSafeModeReason(l_occSrc, iv_instance);
170-
}
141+
bool l_call_home_event = false;
142+
elogProcessActions(l_occElog->actions,
143+
l_occSrc,
144+
l_usrDtls_ptr->userData1,
145+
severity,
146+
l_call_home_event);
171147

172148
// Create OCC error log
173149
// NOTE: word 4 (used by extended reason code) to save off OCC
@@ -185,6 +161,13 @@ namespace HTMGT
185161
l_occElog->extendedRC, // extended reason code
186162
severity);
187163

164+
if (l_call_home_event)
165+
{
166+
// Force info log to the BMC.
167+
// No HW Callouts (SELs) will be created for this error
168+
l_errlHndl->setEselCallhomeInfoEvent(true);
169+
}
170+
188171
// Add callout information
189172
const uint8_t l_max_callouts = l_occElog->maxCallouts;
190173
bool l_bad_fru_data = false;
@@ -295,16 +278,11 @@ namespace HTMGT
295278
"HALT_ON_SRC is set. Resets will be disabled",
296279
iv_instance, l_occSrc);
297280
set_int_flags(get_int_flags() | FLAG_RESET_DISABLED);
281+
// Force unrecoverable elog
282+
l_errlHndl->setSev(ERRORLOG::ERRL_SEV_UNRECOVERABLE);
298283
}
299284
}
300285

301-
// Process force error log to be sent to BMC.
302-
if( (l_occElog->actions & TMGT_ERRL_ACTIONS_FORCE_ERROR_POSTED ) ||
303-
(l_occSrc == (OCCC_COMP_ID | 0x01 ) ) ) //GEN_CALLHOME_LOG
304-
{
305-
l_errlHndl->setEselCallhomeInfoEvent(true);
306-
}
307-
308286
#ifdef CONFIG_CONSOLE_OUTPUT_OCC_COMM
309287
char header[64];
310288
sprintf(header, "OCC%d ELOG: (0x%04X bytes)", iv_instance, i_length);
@@ -426,57 +404,117 @@ namespace HTMGT
426404

427405
} // end Occ::elogAddCallout()
428406

429-
void Occ::elogProcessActions(const uint8_t i_actions,
430-
bool & o_occReset,
431-
ERRORLOG::errlSeverity_t & o_errlSeverity)
407+
408+
void Occ::elogProcessActions(const uint8_t i_actions,
409+
const uint32_t i_src,
410+
uint32_t i_data,
411+
ERRORLOG::errlSeverity_t & io_errlSeverity,
412+
bool & o_call_home)
432413
{
414+
bool l_occReset = false;
415+
o_call_home = false;
416+
433417
if (i_actions & TMGT_ERRL_ACTIONS_WOF_RESET_REQUIRED)
434418
{
435419
iv_failed = false;
436420
iv_resetReason = OCC_RESET_REASON_WOF_REQUEST;
437421
// Check if WOF resets are disabled
438422
if(int_flags_set(FLAG_WOF_RESET_DISABLED) == true)
439423
{
440-
o_occReset = false;
441424
iv_needsWofReset = false;
442425
TMGT_INF("elogProcessActions: OCC%d requested a WOF reset "
443426
"but WOF resets are DISABLED",
444427
iv_instance);
445428
}
446429
else // WOF resets are enabled
447430
{
448-
o_occReset = true;
431+
l_occReset = true;
449432
iv_needsWofReset = true;
450-
TMGT_INF("elogProcessActions: OCC%d requested a WOF reset",
433+
TMGT_ERR("elogProcessActions: OCC%d requested a WOF reset",
451434
iv_instance);
435+
436+
// We compare against one less than the threshold because the
437+
// WOF reset count doesn't get incremented until the resetPrep
438+
if( iv_wofResetCount < (WOF_RESET_COUNT_THRESHOLD-1) )
439+
{
440+
// Not at WOF reset threshold yet. Set sev to INFO
441+
io_errlSeverity = ERRORLOG::ERRL_SEV_INFORMATIONAL;
442+
}
452443
}
444+
445+
// Need to add WOF reason code to OCC object regardless of
446+
// whether WOF resets are disabled.
447+
iv_wofResetReasons |= i_data;
448+
TMGT_ERR("elogProcessActions: WOF Reset Reasons for OCC%d = 0x%08x",
449+
iv_instance, iv_wofResetReasons);
453450
}
454451
else
455452
{
456453
if (i_actions & TMGT_ERRL_ACTIONS_RESET_REQUIRED)
457454
{
458-
o_occReset = true;
455+
l_occReset = true;
459456
iv_failed = true;
460457
iv_resetReason = OCC_RESET_REASON_OCC_REQUEST;
461458

462459
TMGT_INF("elogProcessActions: OCC%d requested reset",
463-
iv_instance);
460+
iv_instance);
461+
462+
// If reset will force safe mode, then make error unrecoverable
463+
if (OCC_RESET_COUNT_THRESHOLD == iv_resetCount)
464+
{
465+
if (io_errlSeverity != ERRORLOG::ERRL_SEV_UNRECOVERABLE)
466+
{
467+
// update severity to UNRECOVERABLE
468+
TMGT_ERR("elogProcessActions: changing severity to "
469+
"UNRECOVERABLE (was sev=0x%02X)",
470+
io_errlSeverity);
471+
io_errlSeverity = ERRORLOG::ERRL_SEV_UNRECOVERABLE;
472+
}
473+
}
474+
else if (io_errlSeverity != ERRORLOG::ERRL_SEV_INFORMATIONAL)
475+
{
476+
// update severity to INFO
477+
TMGT_INF("elogProcessActions: changing severity to "
478+
"INFORMATIONAL (was sev=0x%02X)",
479+
io_errlSeverity);
480+
io_errlSeverity = ERRORLOG::ERRL_SEV_INFORMATIONAL;
481+
// log will be sent to BMC with NO SEL (hardware callouts)
482+
o_call_home = true;
483+
}
464484
}
465485

466486
if (i_actions & TMGT_ERRL_ACTIONS_SAFE_MODE_REQUIRED)
467487
{
468-
o_occReset = true;
488+
l_occReset = true;
469489
iv_failed = true;
470490
iv_resetReason = OCC_RESET_REASON_CRIT_FAILURE;
471491
iv_resetCount = OCC_RESET_COUNT_THRESHOLD;
472492

473493
TMGT_INF("elogProcessActions: OCC%d requested safe mode",
474494
iv_instance);
475495
TMGT_CONSOLE("OCC%d requested system enter safe mode",
476-
iv_instance);
496+
iv_instance);
477497
}
478498
}
479499

500+
// Check if error needs to be forced to the BMC:
501+
// 1. 2A01 = OCC call home/telemetry data, OR
502+
// 2. OCC requested force, but error was changed to info by HTMGT
503+
// (log will be sent to the BMC with NO SEL (hardware callouts))
504+
if ( (i_src == (OCCC_COMP_ID | 0x01 )) || // GEN_CALLHOME_LOG
505+
( (i_actions & TMGT_ERRL_ACTIONS_FORCE_ERROR_POSTED) &&
506+
(io_errlSeverity == ERRORLOG::ERRL_SEV_INFORMATIONAL) ) )
507+
{
508+
o_call_home = true;
509+
}
510+
511+
// If reset required, save the SRC in case it leads to safe mode
512+
if (l_occReset == true)
513+
{
514+
iv_needsReset = true;
515+
OccManager::updateSafeModeReason(i_src, iv_instance);
516+
}
517+
480518
} // end Occ::elogProcessActions()
481519

482520
} // end namespace

0 commit comments

Comments
 (0)