cobra bots: handle stalled connection by disconnecting and reconnecting instead of quitting, and expecting kubernete to restart us

This commit is contained in:
Benjamin Sergeant 2020-06-09 21:39:37 -07:00
parent e49bf24d2d
commit ecfca1f905
2 changed files with 29 additions and 5 deletions

View File

@ -44,6 +44,7 @@ namespace ix
std::atomic<bool> stop(false);
std::atomic<bool> throttled(false);
std::atomic<bool> fatalCobraError(false);
std::atomic<bool> stalledConnection(false);
int minuteCounter = 0;
auto timer = [&sentCount,
@ -95,7 +96,13 @@ namespace ix
std::thread t1(timer);
auto heartbeat = [&sentCount, &receivedCount, &stop, &enableHeartbeat, &heartBeatTimeout, &fatalCobraError] {
auto heartbeat = [&sentCount,
&receivedCount,
&stop,
&enableHeartbeat,
&heartBeatTimeout,
&stalledConnection]
{
setThreadName("Bot heartbeat");
std::string state("na");
@ -111,9 +118,12 @@ namespace ix
if (currentState == state)
{
CoreLogger::error("no messages received or sent for 1 minute, exiting");
fatalCobraError = true;
break;
ss.str("");
ss << "no messages received or sent for "
<< heartBeatTimeout << " seconds, reconnecting";
CoreLogger::error(ss.str());
stalledConnection = true;
}
state = currentState;
@ -234,6 +244,13 @@ namespace ix
std::this_thread::sleep_for(duration);
if (fatalCobraError) break;
if (stalledConnection)
{
conn.disconnect();
conn.connect();
stalledConnection = false;
}
}
}
// Run for a duration, used by unittesting now
@ -245,6 +262,13 @@ namespace ix
std::this_thread::sleep_for(duration);
if (fatalCobraError) break;
if (stalledConnection)
{
conn.disconnect();
conn.connect();
stalledConnection = false;
}
}
}

View File

@ -123,7 +123,7 @@ namespace ix
}
// periodically display all device ids
if (sentCount % 100 == 0)
if (sentCount % 1000 == 0)
{
ss.str(""); // reset the stringstream
ss << "## " << deviceIdCounters.size() << " unique device ids ##" << std::endl;