Skip to content

Commit dad96d1

Browse files
committed
Add retry to slave core wakeup path
We are still seeing some very intermittent errors in the slave core wakeup path. It still seems like we may have a timing issue. Until we figure out exactly what is going on, I am adding a retry mechanism that should get the core to report in correctly. The retry is done by issuing an additional doorbell message to the core that didn't report in. Change-Id: Ib87e5d58e079674d1eebb44c10d0252a35ea0519 Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/70762 Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com> Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
1 parent 2fcb5a9 commit dad96d1

File tree

7 files changed

+101
-7
lines changed

7 files changed

+101
-7
lines changed

src/include/kernel/cpumgr.H

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
/* */
66
/* OpenPOWER HostBoot Project */
77
/* */
8-
/* Contributors Listed Below - COPYRIGHT 2010,2017 */
8+
/* Contributors Listed Below - COPYRIGHT 2010,2019 */
99
/* [+] International Business Machines Corp. */
1010
/* */
1111
/* */
@@ -146,6 +146,15 @@ class CpuManager
146146
static void startCore(uint64_t pir,uint64_t i_threads);
147147

148148

149+
/** @fn wakeupCore
150+
* Start the core, can only be run after startCore.
151+
*
152+
* @param[in] pir - PIR value of first thread in core.
153+
* @param[in] i_threads - Bitstring of threads to enable (left-justified).
154+
*/
155+
static void wakeupCore(uint64_t pir,uint64_t i_threads);
156+
157+
149158
/** @fn forceMemoryPeriodic()
150159
* Force the memory free / coalesce operations to be performed on the
151160
* next "periodic" interval.

src/include/kernel/syscalls.H

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
/* */
66
/* OpenPOWER HostBoot Project */
77
/* */
8-
/* Contributors Listed Below - COPYRIGHT 2010,2018 */
8+
/* Contributors Listed Below - COPYRIGHT 2010,2019 */
99
/* [+] International Business Machines Corp. */
1010
/* */
1111
/* */
@@ -99,6 +99,8 @@ namespace Systemcalls
9999
MISC_CPUNAP,
100100
/** cpu_master_winkle() */
101101
MISC_CPUWINKLE,
102+
/** cpu_wakeup_core() */
103+
MISC_CPUWAKEUPCORE,
102104

103105
/** mm_alloc_block() */
104106
MM_ALLOC_BLOCK,

src/include/sys/misc.h

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
/* */
66
/* OpenPOWER HostBoot Project */
77
/* */
8-
/* Contributors Listed Below - COPYRIGHT 2011,2018 */
8+
/* Contributors Listed Below - COPYRIGHT 2011,2019 */
99
/* [+] International Business Machines Corp. */
1010
/* */
1111
/* */
@@ -239,6 +239,22 @@ int cpu_master_winkle(bool i_fusedCores);
239239
*/
240240
int cpu_all_winkle();
241241

242+
/** @fn cpu_wakeup_core
243+
* @brief Have the kernel wakeup a core that was previously started.
244+
*
245+
* @param[in] pir - PIR value of the first thread on the core.
246+
* @param[in] i_threads - Bitstring of threads to enable (left-justified).
247+
*
248+
* @note The kernel will wakeup all threads on the requested core even
249+
* though the callee only requests with a single PIR value.
250+
*
251+
* @return 0 or -(errno) on failure.
252+
*
253+
* @retval -ENXIO - The core ID was outside of the range the kernel is
254+
* prepared to support.
255+
*/
256+
int cpu_wakeup_core(uint64_t pir,uint64_t i_threads);
257+
242258
/** @fn cpu_crit_assert
243259
* @brief Forces a Terminate Immediate after a crit-assert is issued
244260
* @param[in] i_failAddr - value in the linkRegister of the address

src/kernel/cpumgr.C

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
/* */
66
/* OpenPOWER HostBoot Project */
77
/* */
8-
/* Contributors Listed Below - COPYRIGHT 2010,2018 */
8+
/* Contributors Listed Below - COPYRIGHT 2010,2019 */
99
/* [+] International Business Machines Corp. */
1010
/* */
1111
/* */
@@ -446,7 +446,7 @@ void CpuManager::startCore(uint64_t pir,uint64_t i_threads)
446446
// Only wakeup the threads we were told to wakeup
447447
if( i_threads & (0x8000000000000000 >> i) )
448448
{
449-
printk("Dbell pir 0x%lx\n", pir + i);
449+
printk("Dbell:0x%lx\n", pir + i);
450450
//Initiate the Doorbell for this core/pir
451451
send_doorbell_wakeup(pir + i);
452452
}
@@ -455,6 +455,39 @@ void CpuManager::startCore(uint64_t pir,uint64_t i_threads)
455455
return;
456456
};
457457

458+
void CpuManager::wakeupCore(uint64_t pir,uint64_t i_threads)
459+
{
460+
size_t threads = getThreadCount();
461+
pir = pir & ~(threads-1);
462+
463+
if (pir >=
464+
(KERNEL_MAX_SUPPORTED_NODES * KERNEL_MAX_SUPPORTED_CPUS_PER_NODE))
465+
{
466+
TASK_SETRTN(TaskManager::getCurrentTask(), -ENXIO);
467+
return;
468+
}
469+
470+
//Send a message to userspace that a core with this base pir is being added
471+
// userspace will know which threads on the core to expect already
472+
InterruptMsgHdlr::addCpuCore(pir);
473+
474+
// Physically wakeup the threads with doorbells
475+
// Assumption is that startCore has already run so all
476+
// internal structures are setup
477+
for(size_t i = 0; i < threads; i++)
478+
{
479+
// Only wakeup the threads we were told to wakeup
480+
if( i_threads & (0x8000000000000000 >> i) )
481+
{
482+
printk("Dbell2:0x%lx\n", pir + i);
483+
//Initiate the Doorbell for this core/pir
484+
doorbell_send(pir + i);
485+
}
486+
}
487+
488+
return;
489+
};
490+
458491
size_t CpuManager::getThreadCount()
459492
{
460493
size_t threads = 0;

src/kernel/syscall.C

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ namespace KernelIpc
5555
extern "C"
5656
void kernel_execute_hype_doorbell()
5757
{
58+
printkd("hyp_doorbell on %lx\n", getPIR());
59+
5860
// Per POWER ISA Section 5.9.2, to avoid any weak consistency
5961
// issues we must use a msgsync instruction before consuming
6062
// any data set by a different thread following a doorbell
@@ -141,6 +143,7 @@ namespace Systemcalls
141143
void CpuSprSet(task_t *t);
142144
void CpuNap(task_t *t);
143145
void CpuWinkle(task_t *t);
146+
void CpuWakeupCore(task_t *t);
144147
void MmAllocBlock(task_t *t);
145148
void MmRemovePages(task_t *t);
146149
void MmSetPermission(task_t *t);
@@ -184,6 +187,7 @@ namespace Systemcalls
184187
&CpuSprSet, // MISC_CPUSPRSET
185188
&CpuNap, // MISC_CPUNAP
186189
&CpuWinkle, // MISC_CPUWINKLE
190+
&CpuWakeupCore, // MISC_CPUWAKEUPCORE
187191

188192
&MmAllocBlock, // MM_ALLOC_BLOCK
189193
&MmRemovePages, // MM_REMOVE_PAGES
@@ -848,6 +852,13 @@ namespace Systemcalls
848852
}
849853
}
850854

855+
/** Force thread wakeup via doorbell. */
856+
void CpuWakeupCore(task_t *t)
857+
{
858+
CpuManager::wakeupCore(static_cast<uint64_t>(TASK_GETARG0(t)),
859+
static_cast<uint64_t>(TASK_GETARG1(t)));
860+
};
861+
851862
/**
852863
* Allocate a block of virtual memory within the base segment
853864
* @param[in] t: The task used to allocate a block in the base segment

src/lib/syscall_misc.C

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
/* */
66
/* OpenPOWER HostBoot Project */
77
/* */
8-
/* Contributors Listed Below - COPYRIGHT 2011,2018 */
8+
/* Contributors Listed Below - COPYRIGHT 2011,2019 */
99
/* [+] International Business Machines Corp. */
1010
/* */
1111
/* */
@@ -132,6 +132,14 @@ int cpu_all_winkle()
132132
return rc;
133133
}
134134

135+
int cpu_wakeup_core(uint64_t pir,uint64_t i_threads)
136+
{
137+
return reinterpret_cast<int64_t>(
138+
_syscall2(MISC_CPUWAKEUPCORE,
139+
reinterpret_cast<void*>(pir),
140+
reinterpret_cast<void*>(i_threads)));
141+
}
142+
135143

136144
void cpu_crit_assert(uint64_t i_failAddr)
137145
{

src/usr/isteps/istep16/call_host_activate_slave_cores.C

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
/* */
66
/* OpenPOWER HostBoot Project */
77
/* */
8-
/* Contributors Listed Below - COPYRIGHT 2015,2018 */
8+
/* Contributors Listed Below - COPYRIGHT 2015,2019 */
99
/* [+] International Business Machines Corp. */
1010
/* */
1111
/* */
@@ -122,6 +122,18 @@ void* call_host_activate_slave_cores (void *io_pArgs)
122122

123123
int rc = cpu_start_core(pir, en_threads);
124124

125+
// Workaround to handle some syncing issues with new cpus
126+
// waking
127+
if (-ETIME == rc)
128+
{
129+
TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
130+
"call_host_activate_slave_cores: "
131+
"Time out rc from kernel %d on core 0x%x, resending doorbell",
132+
rc,
133+
pir);
134+
rc = cpu_wakeup_core(pir,en_threads);
135+
}
136+
125137
// Handle time out error
126138
uint32_t l_checkidle_eid = 0;
127139
if (-ETIME == rc)
@@ -208,6 +220,9 @@ void* call_host_activate_slave_cores (void *io_pArgs)
208220
// Throw printk in there too in case it is a kernel issue
209221
ERRORLOG::ErrlUserDetailsPrintk().addToLog(l_errl);
210222

223+
// Add interesting ISTEP traces
224+
l_errl->collectTrace(ISTEP_COMP_NAME,256);
225+
211226
l_stepError.addErrorDetails( l_errl );
212227
errlCommit( l_errl, HWPF_COMP_ID );
213228
break;

0 commit comments

Comments
 (0)