/* sympathy_analyze.c from EmStar at Krugle */
/* ex: set tabstop=2 expandtab shiftwidth=2 softtabstop=2: */
/*
*
* Copyright (c) 2003 The Regents of the University of California. All
* rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Neither the name of the University nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS''
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
/*
*
* Author: Nithya Ramanthan
*
*/
/* NR todo: fix #root-detections, can sometimes use
* outdated routes - dont just specify a node is a root
* just because its route is outdated! */
#include <sympathy.h>
#include <sim/radio.h>
/* Forward declarations of file-local helpers whose definitions appear
 * further down in this file. */
static void check_sink_collisions();
static int check_errs_rx(sympathy_node_info_t* stat, stats_ctr_t* errors_rx, stats_ctr_t*
good_rx, buf_t* fault_buf);
/* Event check. For now this only looks up the node's selected next-hop
 * with find_neighbor() and, when the lookup returns a negative value,
 * appends an error line to the node's fault buffer.
 * A next-hop of 0 is skipped entirely. */
static
void check_events(sympathy_node_info_t* stat)
{
  if ((stat->next_hop.next_hop != 0)
      && (find_neighbor(stat, (Saddr_t) stat->next_hop.next_hop) < 0)) {
    bufprintf(stat->fault_buf, "ERROR: Next-hop: %d is not a neighbor!\n", stat->next_hop.next_hop);
  }
}
/* Record that test `code` failed by OR-ing S_CODE(code) into the
 * bitmask pointed to by error_events. */
static
void log_failed(int* error_events, int code)
{
  *error_events = *error_events | S_CODE(code);
}
/* Returns 1 when the node's num_neighbors is greater than zero,
 * 0 otherwise. */
static
int node_has_neighbors(sympathy_node_info_t* stat)
{
  if (stat->num_neighbors > 0) {
    return 1;
  }
  return 0;
}
/* Route test.
 *
 * - If `stat` is the sink itself (stat->addr == my_node_id): returns 1 when
 *   at least one tracked node has a valid route whose sink is this node.
 * - Otherwise: returns 1 only when the node's own route is valid and names
 *   this node as its sink.
 * Returns 0 in every other case. */
static
int route_to_sink(sympathy_node_info_t* stat)
{
  /* int index, like the other loops over sink.num_srcs in this file; a
   * uint8_t index would wrap before reaching a bound larger than 255. */
  int i;

  /* Ordinary node: only its own route matters */
  if (stat->addr != my_node_id) {
    if (route_valid(stat)) {
      return (stat->next_hop.sink == my_node_id);
    }
    return 0;
  }

  /* Sink: see if anybody has a route to the sink */
  for (i = 0; i < sink.num_srcs; i++) {
    if (route_valid(&sink.status_srcs[i])
        && (sink.status_srcs[i].next_hop.sink == my_node_id)) {
      return 1;
    }
  }
  return 0;
}
/* Neighbor-list test: has anybody heard this node?
 *
 * Counts, over all other tracked nodes, the neighbor-list entries whose
 * node_id equals stat->addr and whose owner passes neighbors_valid().
 * The count is stored in stat->num_heard_this_node and a summary line
 * (listing the addresses of the nodes that heard it) is appended to
 * stat->topology_info.
 *
 * Returns 1 if the count is greater than zero, 0 otherwise. */
static
int node_heard_from(sympathy_node_info_t* stat)
{
  /* int indices, like the other loops over sink.num_srcs in this file; a
   * uint8_t index would wrap before reaching a bound larger than 255. */
  int i, j;
  sympathy_node_info_t* other;
  buf_t* neighbor_buf = buf_new();

  stat->num_heard_this_node = 0;

  /* Parse metrics to see if anybody claims node as a neighbor */
  for (i = 0; i < sink.num_srcs; i++) {
    other = &sink.status_srcs[i];
    if (other->addr == stat->addr) continue;

    for (j = 0; j < other->num_neighbors; j++) {
      if (other->neighbors[j].node_id != stat->addr) continue;
      if (!neighbors_valid(other)) continue;
      stat->num_heard_this_node++;
      bufprintf(neighbor_buf, "%d,", other->addr);
    }
  }

#ifdef USE_BAYES
  /* Keep a running maximum of the count */
  if (stat->num_heard_this_node > stat->max_num_heard_this_node)
    stat->max_num_heard_this_node = stat->num_heard_this_node;
#endif

  if (neighbor_buf->len > 0) {
    bufprintf(stat->topology_info, "Num neighbors heard this node: %d {%s}\n",
              stat->num_heard_this_node, neighbor_buf->buf);
  }
  else {
    bufprintf(stat->topology_info, "NO NEIGHBORS heard this node!\n");
  }

  buf_free(neighbor_buf);
  return (stat->num_heard_this_node > 0);
}
/* Runs the CRC-error vs. good-packet check (check_errs_rx) on the sink's
 * own status entry. When it trips, the S_NO_COLLISIONS test is logged as
 * failed and the sink is flagged as congested. */
static
void check_sink_collisions()
{
  sympathy_node_info_t *sink_stat = find_status_ptr(my_node_id);
  int congested = check_errs_rx(sink_stat, &sink_stat->num_pkts_crc_error,
                                &sink_stat->num_pkts_rx, sink_stat->fault_buf);

  if (!congested) return;

  log_failed(&sink_stat->error_events, S_NO_COLLISIONS);
  sink_stat->congestion_detected = 1;
}
/* Calls stats_ctr_update() on every counter kept for one node: the
 * node-wide counters plus the counters of each registered application.
 * `clear` is forwarded unchanged to every call; when it is non-zero the
 * node's `rebooted` flag is reset as well.
 * NOTE(review): the third stats_ctr_update() argument is 1 only for
 * node_num_pkts_tx, num_metrics_tx and num_stats_tx; its meaning is not
 * visible from this file. */
static void update_counters(sympathy_node_info_t* stat, uint8_t clear)
{
  int idx;
  sympathy_node_app_info_t* app;

  /* Sympathy traffic and per-packet-type counters */
  stats_ctr_update(&stat->metrics_rx, clear, 0);
  stats_ctr_update(&stat->sympathy_stats_rx, clear, 0);
  for (idx = 0; idx < NUM_TOS_PKT_TYPES; idx++) {
    stats_ctr_update(&stat->tos_packets[idx], clear, 0);
  }
  stats_ctr_update(&stat->errs_rx, clear, 0);

  /* Counters of every application registered with sympathy */
  for (idx = 0; idx < sink.num_apps_registered; idx++) {
    app = &stat->node_app_info[idx];
    stats_ctr_update(&app->node_num_pkts_rx, clear, 0);
    stats_ctr_update(&app->node_send_failures, clear, 0);
    stats_ctr_update(&app->node_max_queue_occupancy, clear, 0);
    stats_ctr_update(&app->node_num_pkts_dropped, clear, 0);
    stats_ctr_update(&app->node_num_pkts_tx, clear, 1);
    stats_ctr_update(&app->app_stats_rx_from_node, clear, 0);
    stats_ctr_update(&app->sink_pkt_tx, clear, 0);
    stats_ctr_update(&app->sink_pkt_rx, clear, 0);
    stats_ctr_update(&app->sink_pkt_expected_rx, clear, 0);
  }

  /* Remaining node-wide counters */
  stats_ctr_update(&stat->time_awake_mins, clear, 0);
  stats_ctr_update(&stat->num_metrics_tx, clear, 1);
  stats_ctr_update(&stat->num_stats_tx, clear, 1);
  stats_ctr_update(&stat->num_pkts_tx_succeeded, clear, 0);
  stats_ctr_update(&stat->num_pkts_rx, clear, 0);
  stats_ctr_update(&stat->num_pkts_dropped, clear, 0);
  stats_ctr_update(&stat->num_pkts_tx_failed, clear, 0);
  stats_ctr_update(&stat->num_pkts_crc_error, clear, 0);

  if (clear) {
    stat->rebooted = 0;
  }
}
/* Frees the buffer currently held in *buf and stores a new one obtained
 * from buf_new() in its place. */
void clear_buf(buf_t** buf)
{
  buf_t* stale = *buf;

  buf_free(stale);
  *buf = buf_new();
}
/**** Running Tests ****/
/*
static void
compare_link_quality(sympathy_node_info_t* stat)
{
float quality;
int i;
for (i = 0; i < stat->num_neighbors; i++) {
if (stat->neighbor_info[i].sim_link_quality < 0) {
elog(LOG_ERR, "ERROR Couldnt get simulation link quality for nodes %d -> %d\n",
stat->addr, stat->neighbors[i].node_id);
continue;
}
quality = 100 * (stat->neighbors[i].quality/255);
elog(LOG_DEBUG(1), "sim-link: %f, reported: %d, diff: %f\n",
stat->neighbor_info[i].sim_link_quality,
quality,
abs_float(stat->neighbor_info[i].sim_link_quality -
quality));
}
}
*/
/* application-specific tests */
/* Returns 1 when too few messages were received, 0 otherwise.
 *
 * "Too few" means either
 *   - msgs_have is below msg_reception_percent percent of msgs_expected
 *     (integer arithmetic, so the threshold is rounded down), or
 *   - nothing was received although something was expected. */
static int
check_insufficient_msgs(int msgs_have, int msgs_expected, int msg_reception_percent)
{
  int required = (msgs_expected * msg_reception_percent) / 100;

  if (msgs_have < required) {
    return 1;
  }
  if ((msgs_have == 0) && (msgs_expected > 0)) {
    return 1;
  }
  return 0;
}
/* Request-reception test for one application component.
 * Compares the requests the node received in the previous epoch against
 * the requests the sink transmitted, using the component's
 * pkt_reception_percent threshold. On failure a line is appended to
 * fault_buf and 0 is returned; otherwise returns 1. */
static
int received_requests(sympathy_node_app_info_t* snode, buf_t* fault_buf)
{
  int insufficient = check_insufficient_msgs(snode->node_num_pkts_rx.agg_prev_epoch,
                                             snode->sink_pkt_tx.agg_prev_epoch,
                                             snode->pkt_reception_percent);

  if (!insufficient) {
    return 1;
  }

  bufprintf(fault_buf,"\t0x%x: Num reqs node rx/Num reqs sink tx: %d/%d\n",
            S_CODE(S_COMP_RX_REQS), snode->node_num_pkts_rx.agg_prev_epoch,
            snode->sink_pkt_tx.agg_prev_epoch);
  return 0;
}
/* Response-transmission test for one application component.
 * Compares the packets the node transmitted in the previous epoch against
 * the packets the sink expected to receive, using the component's
 * pkt_reception_percent threshold. On failure a line is appended to
 * fault_buf and 0 is returned; otherwise returns 1. */
static
int comp_tx_data(sympathy_node_app_info_t* snode, buf_t* fault_buf)
{
  int insufficient = check_insufficient_msgs(snode->node_num_pkts_tx.agg_prev_epoch,
                                             snode->sink_pkt_expected_rx.agg_prev_epoch,
                                             snode->pkt_reception_percent);

  if (!insufficient) {
    return 1;
  }

  bufprintf(fault_buf,
            "\tComp tx/ sink expected: %d/%d\n",
            snode->node_num_pkts_tx.agg_prev_epoch,
            snode->sink_pkt_expected_rx.agg_prev_epoch);
  return 0;
}
/* Returns 1 when the node's num_metrics_tx aggregate for the previous
 * epoch is greater than zero, 0 otherwise. */
static
int node_tx_metrics(sympathy_node_info_t* stat)
{
  return (stat->num_metrics_tx.agg_prev_epoch > 0) ? 1 : 0;
}
/* Data-reception test on a single counter.
 *
 * pkts_rx   - counter to inspect; only its previous-epoch aggregate decides
 *             the result
 * type      - label used in the fault line (e.g. "data", "stats")
 * fault_buf - buffer that receives a diagnostic line on failure; may be
 *             NULL when the caller only wants the boolean result
 *
 * Returns 1 if anything was received in the previous epoch, 0 otherwise.
 *
 * NR: a time-based variant of this test
 * (get_minutes_since_event(&pkts_rx->last_updated) >= EPOCH_MSEC/60000)
 * was tried here and did not work. */
int received_data(stats_ctr_t* pkts_rx, char* type, buf_t* fault_buf)
{
  if (pkts_rx->agg_prev_epoch != 0) {
    return 1;
  }

  /* Only report when the caller supplied somewhere to report to */
  if (fault_buf != NULL) {
    bufprintf(fault_buf, "\t%s: Num pkts rx: %d(%d)\n",
              type, pkts_rx->ctr, pkts_rx->agg_prev_epoch);
  }
  return 0;
}
/* Compares what the sink received (pkts_sink_rx) against what the node
 * says it transmitted (pkts_node_tx) for the previous epoch, using
 * pkt_reception_percent as the threshold. On failure a line is appended
 * to fault_buf and 0 is returned; otherwise returns 1. */
static
int receiving_data_node_tx(stats_ctr_t* pkts_sink_rx,
                           stats_ctr_t* pkts_node_tx, buf_t* fault_buf, int pkt_reception_percent)
{
  int insufficient = check_insufficient_msgs(pkts_sink_rx->agg_prev_epoch,
                                             pkts_node_tx->agg_prev_epoch,
                                             pkt_reception_percent);

  if (!insufficient) {
    return 1;
  }

  bufprintf(fault_buf,
            "\tsink rx pkts/Node tx pkts: %d/%d\n",
            pkts_sink_rx->agg_prev_epoch, pkts_node_tx->agg_prev_epoch);
  return 0;
}
/* Reception-error test.
 * Trips (returns 1 and appends a line to fault_buf) when the previous
 * epoch saw at least one errored packet AND the errored count is at least
 * PERCENT_GOOD_PACKETS_CONGESTION percent of the good count.
 * Returns 0 otherwise. The `stat` argument is not used. */
static int check_errs_rx(sympathy_node_info_t* stat, stats_ctr_t* errors_rx,
                         stats_ctr_t* good_rx, buf_t* fault_buf)
{
  if (errors_rx->agg_prev_epoch > 0) {
    if (errors_rx->agg_prev_epoch
        >= (good_rx->agg_prev_epoch*PERCENT_GOOD_PACKETS_CONGESTION)/100) {
      bufprintf(fault_buf, "\tRx bad/good %d/%d\n",
                errors_rx->agg_prev_epoch, good_rx->agg_prev_epoch);
      return 1;
    }
  }
  return 0;
}
/* Compares what the sink received (pkts_rx) against what it expected to
 * receive (expected_pkts_rx) for the previous epoch, using
 * pkt_reception_percent as the threshold. On failure a line is appended
 * to fault_buf, the buffer contents are logged at debug level, and 0 is
 * returned; otherwise returns 1. */
static
int received_sufficient_data(stats_ctr_t* pkts_rx,
                             stats_ctr_t* expected_pkts_rx, buf_t* fault_buf, int pkt_reception_percent)
{
  int insufficient = check_insufficient_msgs(pkts_rx->agg_prev_epoch,
                                             expected_pkts_rx->agg_prev_epoch,
                                             pkt_reception_percent);

  if (!insufficient) {
    return 1;
  }

  bufprintf(fault_buf, "\tSink rx/Expected to rx: %d/%d\n",
            pkts_rx->agg_prev_epoch, expected_pkts_rx->agg_prev_epoch);
  elog(LOG_DEBUG(1), "CHECK fault-buf: %s\n", fault_buf->buf);
  return 0;
}
/* Has the sink received any packet from this node recently?
 * The decision is delegated to event_valid() on the timestamp of the
 * node's last packet (stat->packet.last_updated).
 * An earlier variant compared get_minutes_since_event() on that timestamp
 * against EPOCH_MSEC/60000; it is no longer used. */
static int received_some_pkts_from_node(sympathy_node_info_t* stat)
{
  int still_valid = event_valid(&stat->packet.last_updated);

  return still_valid;
}
#ifdef USE_BAYES
/* Returns non-zero when packets of type SNON_ROUTING_PKT were counted for
 * this node in the previous epoch.
 * tos_packets[] holds counter structures (they are passed to
 * stats_ctr_update() elsewhere in this file), so the comparison is made on
 * the counter's previous-epoch aggregate rather than on the structure.
 * NOTE(review): agg_prev_epoch is chosen to match the other tests in this
 * file; confirm that `ctr` was not the intended field. */
int received_non_symp_app_pkts_from_node(sympathy_node_info_t* stat)
{
  return (stat->tos_packets[SNON_ROUTING_PKT].agg_prev_epoch > 0);
}
#endif
/**** General Testing Framework ***/
/* Collects every other tracked node that shares this node's next-hop.
 *
 * A peer qualifies when this node has a next-hop greater than 0, the peer
 * has the same next-hop and the same sink, and the peer's route is valid.
 * Qualifying addresses are stored in stat->nodes_with_same_next_hop[] and
 * counted in stat->num_with_same_next_hop; a human-readable list is
 * appended to stat->topology_info.
 * NOTE(review): decode_root_cause() is handed the peer's failure_type, not
 * its failure_root_cause - confirm that is intended. */
static
void find_nodes_with_same_next_hop(sympathy_node_info_t* stat)
{
  int idx;
  sympathy_node_info_t* peer;

  stat->num_with_same_next_hop = 0;

  for (idx = 0; idx < sink.num_srcs; idx++) {
    peer = &sink.status_srcs[idx];

    /* Never compare the node with itself */
    if (peer->addr == stat->addr) continue;

    if (!((stat->next_hop.next_hop > 0)
          && (peer->next_hop.next_hop == stat->next_hop.next_hop)
          && (peer->next_hop.sink == stat->next_hop.sink)
          && (route_valid(peer)))) {
      continue;
    }

    bufprintf(stat->topology_info, ", %d ", peer->addr);
    stat->nodes_with_same_next_hop[stat->num_with_same_next_hop] = peer->addr;
    if (peer->failure_type > SFL_OK) {
      bufprintf(stat->topology_info, "(root-cause=%s)",
                decode_root_cause(peer->failure_type, peer->addr));
    }
    stat->num_with_same_next_hop++;
  }

  /* Close the list only when something was added to it */
  if (stat->num_with_same_next_hop) {
    bufprintf(stat->topology_info, "have the same next-hop(%d)!\n",
              stat->next_hop.next_hop);
  }
}
/*** Global Functions ***/
/* Returns 1 when the bit S_CODE(code) is NOT set in error_events (the
 * test passed), 0 when it is set (the test was logged as failed). */
int check_passed(int error_events, int code)
{
  if (error_events & S_CODE(code)) {
    return 0;
  }
  return 1;
}
/* Event callback: runs one analysis pass, then rolls over to the next
 * window.
 *
 * After track_lost_nodes() has run, sink.window is advanced modulo
 * TRACK_FAILURE_WINDOW_SIZE and sink.metric_pd is incremented. Only then
 * are the counters of every node cleared (update_counters must run AFTER
 * sink.window has been incremented). Nodes with a failure are passed to
 * sympathy_emview_text(). None of the three arguments is used.
 *
 * Always returns EVENT_RENEW. */
int call_track_lost_nodes(void* data, int interval, g_event_t* event)
{
  int n;
  sympathy_node_info_t* node;

  track_lost_nodes();

  /* Move on to the next window */
  inc_mod((uint16_t *) &sink.window, 1, TRACK_FAILURE_WINDOW_SIZE);
  sink.metric_pd++;

  /* Clear all the counters for the new current window */
  for (n = 0; n < sink.num_srcs; n++) {
    node = &sink.status_srcs[n];
    update_counters(node, 1);
    if (node->failure_type > SFL_OK) {
      sympathy_emview_text(node);
    }
  }

  g_status_dev_notify(sink.metrics_status);
  return EVENT_RENEW;
}
/* Step 1: run every node-level test and log each failure as a bit in
 * stat->error_events. The tests are in no specific order, but several of
 * them append lines to stat->fault_buf / stat->topology_info. */
static
void step1_run_tests(sympathy_node_info_t* stat)
{
  int* failed = &stat->error_events;

  if (!route_to_sink(stat)) log_failed(failed, S_ROUTE_TO_SINK);

  if (!node_has_neighbors(stat)) log_failed(failed, S_NEIGHBORS);

  /* Check if node received mostly good packets from other nodes */
  if (check_errs_rx(stat, &stat->num_pkts_crc_error, &stat->num_pkts_rx, stat->fault_buf)) {
    log_failed(failed, S_NO_COLLISIONS);
    stat->congestion_detected = 1;
  }

  /* Sympathy metrics received by the sink */
  if (!received_data(&stat->metrics_rx, "data", stat->fault_buf)) {
    log_failed(failed, S_RX_DATA_THIS_PD);
  }
  if (!received_sufficient_data(&stat->metrics_rx,
                                &sink.expected_num_sympathy_metrics, stat->fault_buf,
                                SMSG_RECEPTION_THRESH_DEFAULT)) {
    elog(LOG_DEBUG(1), "CHECK node %d didnt rx suff data!\n",
         stat->addr);
    log_failed(failed, S_RX_SUFFICIENT_DATA);
  }

  /* Sympathy statistics received by the sink */
  if (!received_data(&stat->sympathy_stats_rx, "stats", stat->fault_buf)) {
    log_failed(failed, S_RX_STATS);
  }

  if (!node_tx_metrics(stat)) log_failed(failed, S_COMP_TX_DATA);

  if (!receiving_data_node_tx(&stat->sympathy_stats_rx,
                              &stat->num_stats_tx, stat->fault_buf,
                              SMSG_RECEPTION_THRESH_DEFAULT)) {
    log_failed(failed, S_RX_STATS_COMP_TX);
  }

  if (!received_some_pkts_from_node(stat)) log_failed(failed, S_RX_SOME_PKTS_FROM_NODE);

  if (!node_heard_from(stat)) log_failed(failed, S_NODE_HEARD_FROM);
}
/* Step 1 for a single application component: run the component-level
 * tests and log each failure as a bit in snode->error_events. Diagnostic
 * lines go to snode->fault_buf. */
static void step1_check_component(sympathy_node_app_info_t* snode)
{
  int* failed = &snode->error_events;

  /* If we haven't received app metrics, then we cant categorize! */
  if (!received_data(&snode->app_stats_rx_from_node, "stats", snode->fault_buf)) {
    log_failed(failed, S_RX_STATS);
  }

  if (!received_requests(snode, snode->fault_buf)) {
    log_failed(failed, S_COMP_RX_REQS);
  }

  if (!comp_tx_data(snode, snode->fault_buf)) {
    log_failed(failed, S_COMP_TX_DATA);
  }

  if (!received_sufficient_data(&snode->sink_pkt_rx,
                                &snode->sink_pkt_expected_rx, snode->fault_buf,
                                snode->pkt_reception_percent)) {
    log_failed(failed, S_RX_SUFFICIENT_DATA);
  }

  if (!received_data(&snode->sink_pkt_rx, "data", snode->fault_buf)) {
    log_failed(failed, S_RX_DATA_THIS_PD);
  }
}
/* Step 2: turn the logged test failures into a failure category.
 *
 * We check for failures as much as is possible, so the receipt of metrics
 * is used to decide whether sufficient data was received from a node; the
 * components are checked afterwards.
 *
 * Returns SFL_NO_DATA when S_RX_SOME_PKTS_FROM_NODE failed, otherwise
 * SFL_INSUFFICIENT_DATA when S_RX_SUFFICIENT_DATA failed, otherwise
 * SFL_OK. */
static
int step2_set_failure(int error_events)
{
  int failure = SFL_OK;

  if (!check_passed(error_events, S_RX_SOME_PKTS_FROM_NODE)) {
    failure = SFL_NO_DATA;
  }
  else if (!check_passed(error_events, S_RX_SUFFICIENT_DATA)) {
    failure = SFL_INSUFFICIENT_DATA;
  }
  return failure;
}
/* Step 3 for the sink: looks for a system-wide failure.
 *
 * After running the sink collision check, the first of these tests that
 * fails sets stat->failure_root_cause: no neighbors, nobody claims the
 * sink as a neighbor, no route. In those cases SFL_NO_DATA is returned.
 * When all three pass, SFL_OK is returned and failure_root_cause is left
 * as it was. */
static
int step3_root_cause_system_failure(sympathy_node_info_t* stat)
{
  int failure = SFL_OK;

  check_sink_collisions();

  if (!node_has_neighbors(stat)) {
    stat->failure_root_cause = SRC_NO_NEIGHBORS;
    failure = SFL_NO_DATA;
  }
  else if (!node_heard_from(stat)) {
    stat->failure_root_cause = SRC_NOBODY_CLAIMS_SINK_AS_NEIGHBOR;
    failure = SFL_NO_DATA;
  }
  else if (!route_to_sink(stat)) {
    stat->failure_root_cause = SRC_NO_ROUTE;
    failure = SFL_NO_DATA;
  }

  elog(LOG_DEBUG(1), "CHECK system failure = %d, root-cause: %d\n",
       failure, stat->failure_root_cause);
  return failure;
}
/* Step 3: pick a root cause for a failure that has already been detected.
 *
 * stat         - node being diagnosed; its own error_events supply the
 *                node-level tests (heard-from, neighbors, route)
 * error_events - bitmask supplying the remaining tests; callers pass
 *                either the node's or one component's mask
 * test_events  - when non-zero, check_events() is run on the node first
 *
 * The causes are tried in a fixed priority order and the first match is
 * returned; SRC_BAD_PATH_TO_SINK is the fallback. */
static
int step3_root_cause_failure(sympathy_node_info_t* stat, int error_events,
                             int test_events)
{
  if (test_events) {
    check_events(stat);
  }

  if (stat->rebooted) {
    return SRC_NODE_REBOOTED;
  }

  /* Nothing received from the node and nobody has heard it */
  if ((!check_passed(error_events, S_RX_SOME_PKTS_FROM_NODE))
      && (!check_passed(stat->error_events, S_NODE_HEARD_FROM))) {
    return SRC_NODE_FAILED;
  }

  if (!check_passed(stat->error_events, S_NEIGHBORS)) {
    return SRC_NO_NEIGHBORS;
  }

  if (!check_passed(stat->error_events, S_ROUTE_TO_SINK)) {
    return SRC_NO_ROUTE;
  }

  /* If the sink hasn't received statistics from this component, then
   * sympathy can't do anything, and just assumes that the sink is not
   * receiving data sent by the node. */
  if (!check_passed(error_events, S_RX_STATS)) {
    return SRC_BAD_PATH_TO_SINK;
  }

  /* Maybe it is because the component is not receiving the requests */
  if (!check_passed(error_events, S_COMP_RX_REQS)) {
    return SRC_BAD_PATH_TO_NODE;
  }

  /* Otherwise because the node is not transmitting data in response */
  if (!check_passed(error_events, S_COMP_TX_DATA)) {
    return SRC_BAD_NODE_TRANSMIT;
  }

  return SRC_BAD_PATH_TO_SINK;
}
/* Step 4: decide where a node's failure originates - the node itself
 * (S_SELF), the sink (S_SINK) or some node along its path (S_PATH) - and
 * record the result in stat->failure_localization and, where applicable,
 * stat->source_node_failure.
 *
 * This func is only called if the node has a failure to begin with. As a
 * side effect it also refreshes the node's list of peers sharing its
 * next-hop. */
static
void step4_localize_failure(sympathy_node_info_t* stat)
{
  sympathy_node_info_t* curr_stat = NULL;
  sympathy_node_info_t* sink_stat = find_status_ptr(my_node_id);
  int iter = 0;

  stat->failure_localization = S_SELF;
  find_nodes_with_same_next_hop(stat);

  /* If the sink has a failure, then all failures are localized
   * to the sink - other than the sink's failure, which is localized to
   * itself. */
  if (stat->addr == my_node_id) {
    stat->failure_localization = S_SELF;
    return;
  }
  /* If the node rebooted, nothing can explain that along the path */
  else if (stat->rebooted) {
    stat->failure_localization = S_SELF;
    return;
  }
  /* If the sink has a failure, or the node's failure is just a communication
   * issue and the sink is experiencing congestion, then localize it to the
   * sink. The lookup result is checked first so that a missing sink entry
   * falls through to path localization instead of being dereferenced. */
  else if ((sink_stat != NULL)
           && ((sink_stat->failure_type > SFL_OK)
               || ((stat->failure_root_cause < SRC_NO_NEIGHBORS)
                   && (sink_stat->congestion_detected)))) {
    stat->failure_localization = S_SINK;
    stat->source_node_failure = my_node_id;
    return;
  }
  /* Otherwise we try to find the source of the failure somewhere in the
   * network. Even if the route is not valid, we still use it as an indicator
   * for fault localization (hence no route_valid(stat) test here). This is a
   * reasonable thing to do because the route is probably invalid as a result
   * of the fault. */
  else {
    curr_stat = stat;
    while ((curr_stat = iter_next_hop(curr_stat, &iter,
                                      curr_stat->next_hop.sink))) {
      /* If this node along the path has no root-caused failure
       * then it cannot be the source of the node's failure */
      if (!(curr_stat->congestion_detected
            || (curr_stat->failure_type > SFL_OK))) continue;

      /* If we find a worse root-cause on a node closer to the sink
       * (worse defined by ordering of root-causes), then that node
       * is the source of the current node's problems.
       * There are some exceptions to this ordering.
       * If the current node's failure is less critical than
       * no-neighbors, then it can be explained by any failure
       * downstream from it - NOT just one that is "worse" */
      if ((curr_stat->failure_root_cause >= stat->failure_root_cause)
          || (stat->failure_root_cause < SRC_NO_NEIGHBORS)) {
        stat->failure_localization = S_PATH;
        stat->source_node_failure = curr_stat->addr;
      }

      /* If the next hop isn't valid, we can't gauge the rest of
       * the route, so we keep the current status */
      if (!route_valid(curr_stat)) break;
    }
  }

  /* If failure localized to self, and there is congestion at this node,
   * then we will localize it to the path, but specify the node as itself */
  if (stat->failure_localization == S_SELF) {
    if (stat->congestion_detected) {
      stat->source_node_failure = stat->addr;
      stat->failure_localization = S_PATH;
    }
  }
  return;
}
void track_lost_nodes()
{
int i, tmp_root_cause, j;
sympathy_node_app_info_t* snode;
sympathy_node_info_t* stat;
int notify = 0;
elog(LOG_DEBUG(1), "window: %d, metric-pd: %d\n", sink.window,
sink.metric_pd);
/* Only include nodes whom we have heard from in the past
* epoch - in analysis of other nodes */
for (i = 0; i < sink.num_srcs; i++)
{
stat = &sink.status_srcs[i];
/* Calculate values for last-epoch */
update_counters(stat, 0);
/* Clear previous values, which are stored until next calculation */
stat->error_events = 0;
clear_buf(&stat->fault_buf);
clear_buf(&stat->topology_info);
if (!received_data(&stat->metrics_rx, "data", NULL)) {
stat->metrics_valid = 0;
}
else stat->metrics_valid = 1;
stat->congestion_detected = 0;
}
/* Don't begin checking for failures until we have had
* TRACK_FAILURE_WINDOW_SIZE metrics periods */
if (sink.metric_pd < TRACK_FAILURE_WINDOW_SIZE) return;
/* Set the failure category for all nodes */
for (i = 0; i < sink.num_srcs; i++)
{
stat = &sink.status_srcs[i];
tmp_root_cause = stat->failure_root_cause;
/* If the node is a sink, then look for system wide failures because the
* sink itself won't have any failures we want to detect */
if (stat->addr == my_node_id)
{
stat->failure_type = step3_root_cause_system_failure(stat);
}
/* Otherwise check failures on the node, sympathy and remaining components */
else
{
step1_run_tests(stat);
// NR make sympathy one of the components and check it as such?
stat->failure_type = step2_set_failure(stat->error_events);
/* Then check metrics for each application registered with sympathy */
for (j = 0; j < sink.num_apps_registered; j++) {
snode = &stat->node_app_info[j];
snode->error_events = 0;
clear_buf(&snode->fault_buf);
/* Run tests on component */
step1_check_component(snode);
snode->failure_type = step2_set_failure(snode->error_events);
/* If we have a failure from this node, then try to root-cause it */
if (snode->failure_type > SFL_OK) {
step3_root_cause_failure(stat, snode->error_events, 0);
}
/* If the current failure assignment for the node is an OK, and a component
* has a failure, then we set the current failure assignment
* to Insufficient data */
if ((stat->failure_type == SFL_OK) && (snode->failure_type > SFL_OK)) {
stat->failure_type = SFL_INSUFFICIENT_DATA;
stat->failure_root_cause = snode->failure_root_cause;
}
}
/* If we have a failure from this node, then try to root-cause it */
if (stat->failure_type > SFL_OK) {
stat->failure_root_cause =
step3_root_cause_failure(stat, stat->error_events, 1);
elog(LOG_DEBUG(1), "CHECK node %d had failure: %d, root-cause: %d\n",
stat->addr, stat->failure_type, stat->failure_root_cause);
}
}
/* If this root-cause is new, we note it and notify devices */
if (stat->failure_type > SFL_OK) {
if (tmp_root_cause != stat->failure_root_cause) {
stat->period_root_caused = sink.metric_pd;
notify = 1;
}
}
}
if (notify) {
g_status_dev_notify(sink.summary_status);
g_status_dev_notify(sink.fail_status);
}
#ifdef USE_BAYES
/* create bayes network based on routes */
bayes_classify_network(my_node_id);
#endif
/* Clear the agg_prev_epoch values, and update counters */
stats_ctr_update(&sink.expected_num_sympathy_metrics, 0, 0);
stats_ctr_update(&sink.expected_num_sympathy_metrics, 1, 0);
/* Diagnose the failure - either caused by congestion along
* the path, or what? */
for (j = 0; j < sink.num_srcs; j++) {
stat = &sink.status_srcs[j];
if (stat->failure_type > SFL_OK) step4_localize_failure(stat);
}
}
/* See more files for this project here */