Code Search for Developers
 
 
  

sympathy_analyze.c from EmStar at Krugle


Show sympathy_analyze.c syntax highlighted

/* ex: set tabstop=2 expandtab shiftwidth=2 softtabstop=2: */
/*
 *
 * Copyright (c) 2003 The Regents of the University of California.  All 
 * rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Neither the name of the University nor the names of its
 *   contributors may be used to endorse or promote products derived
 *   from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS''
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

 /*
  *
  * Author: Nithya Ramanathan
  *
  */

/* NR todo: fix #root-detections, can sometimes use
 * outdated routes - don't just specify a node is a root
 * just because its route is outdated! */

#include <sympathy.h>
#include <sim/radio.h>

/* Forward declarations: both helpers are defined further down in this file
 * but are referenced before their definitions. */
static void check_sink_collisions();

static int check_errs_rx(sympathy_node_info_t* stat, stats_ctr_t* errors_rx, stats_ctr_t* 
    good_rx, buf_t* fault_buf);

/*
 * Event sanity check for one node.  For now the only check is whether the
 * node's selected next-hop appears in its neighbor list; if it does not
 * (find_neighbor() returns a negative value), an error line is appended to
 * the node's fault buffer.
 *
 * Nothing is checked when the next-hop field is 0.
 */
static
void check_events(sympathy_node_info_t* stat)
{
  if (stat->next_hop.next_hop != 0
      && find_neighbor(stat, (Saddr_t) stat->next_hop.next_hop) < 0)
  {
    bufprintf(stat->fault_buf, "ERROR: Next-hop: %d is not a neighbor!\n", stat->next_hop.next_hop);
  }
}

/*
 * Record a failed test: OR the bit(s) produced by S_CODE(code) into the
 * bitmask pointed to by error_events.  check_passed() reads the same mask.
 */
static
void log_failed(int* error_events, int code)
{
  *error_events = *error_events | S_CODE(code);
}

/* Returns 1 when the node reports at least one neighbor, 0 otherwise. */
static
int node_has_neighbors(sympathy_node_info_t* stat)
{
  if (stat->num_neighbors > 0) return 1;
  return 0;
}

/*
 * Does a valid route to this sink exist?
 *
 * - For an ordinary node: its own route must be valid (route_valid()) and
 *   name my_node_id as the sink.
 * - For the sink itself (stat->addr == my_node_id): at least one known
 *   source must have a valid route that names my_node_id as the sink.
 *
 * Returns non-zero when such a route exists, 0 otherwise.
 */
static
int route_to_sink(sympathy_node_info_t* stat)
{
  uint8_t src;

  /* Ordinary node: only its own route matters */
  if (stat->addr != my_node_id)
  {
    if (!route_valid(stat)) return 0;
    return (stat->next_hop.sink == my_node_id);
  }

  /* Sink: look for anybody who routes to us */
  for (src = 0; src < sink.num_srcs; src++)
  {
    if (!route_valid(&sink.status_srcs[src])) continue;
    if (sink.status_srcs[src].next_hop.sink == my_node_id) return 1;
  }

  return 0;
}

/*
 * Count how many neighbor-list entries, across all other sources known to the
 * sink, name this node while that source's neighbor data is still considered
 * valid (neighbors_valid()).  The count is stored in
 * stat->num_heard_this_node and a summary line is appended to
 * stat->topology_info.
 *
 * Returns non-zero when at least one such entry was found; a node is claimed
 * to have NOT been heard from when the count is zero.
 */
static
int node_heard_from(sympathy_node_info_t* stat)
{
  uint8_t src, nbr;
  buf_t* heard_by = buf_new();  /* comma-separated list of claiming nodes */

  stat->num_heard_this_node = 0;

  for (src = 0; src < sink.num_srcs; src++)
  {
    /* The node's own entry is not evidence that somebody else heard it */
    if (sink.status_srcs[src].addr == stat->addr) continue;

    for (nbr = 0; nbr < sink.status_srcs[src].num_neighbors; nbr++)
    {
      if (sink.status_srcs[src].neighbors[nbr].node_id != stat->addr) continue;
      if (!neighbors_valid(&sink.status_srcs[src])) continue;

      stat->num_heard_this_node++;
      bufprintf(heard_by, "%d,", sink.status_srcs[src].addr);
    }
  }

#ifdef USE_BAYES
  /* Keep the high-water mark of the count */
  if (stat->num_heard_this_node > stat->max_num_heard_this_node)
  {
    stat->max_num_heard_this_node = stat->num_heard_this_node;
  }
#endif

  if (heard_by->len > 0)
  {
    bufprintf(stat->topology_info, "Num neighbors heard this node: %d {%s}\n",
        stat->num_heard_this_node, heard_by->buf);
  }
  else
  {
    bufprintf(stat->topology_info, "NO NEIGHBORS heard this node!\n");
  }

  buf_free(heard_by);
  return (stat->num_heard_this_node > 0);
}


/*
 * Run the bad-vs-good receive check (check_errs_rx()) on the sink's own
 * status entry.  When it trips, the S_NO_COLLISIONS test is logged as failed
 * and congestion_detected is set on the sink's entry.
 */
static
void check_sink_collisions()
{
  sympathy_node_info_t *sink_stat = find_status_ptr(my_node_id);

  if (!check_errs_rx(sink_stat, &sink_stat->num_pkts_crc_error,
        &sink_stat->num_pkts_rx, sink_stat->fault_buf))
  {
    return;
  }

  log_failed(&sink_stat->error_events, S_NO_COLLISIONS);
  sink_stat->congestion_detected = 1;
}

/*
 * Run stats_ctr_update() over every counter kept for one node: the node-wide
 * counters, one counter per TOS packet type, and the per-application counters
 * of every registered application.
 *
 * `clear` is forwarded unchanged to each stats_ctr_update() call; when it is
 * non-zero the node's `rebooted` flag is reset as well.
 *
 * The third argument is 1 only for node_num_pkts_tx, num_metrics_tx and
 * num_stats_tx and 0 everywhere else.
 * NOTE(review): the meaning of that flag is defined by stats_ctr_update(),
 * which is not visible here - confirm there.
 */
static void update_counters(sympathy_node_info_t* stat, uint8_t clear)
{
  int idx;
  sympathy_node_app_info_t* app;

  stats_ctr_update(&stat->metrics_rx, clear, 0);
  stats_ctr_update(&stat->sympathy_stats_rx, clear, 0);

  /* One counter per TOS packet type */
  for (idx = 0; idx < NUM_TOS_PKT_TYPES; idx++) {
    stats_ctr_update(&stat->tos_packets[idx], clear, 0);
  }
  stats_ctr_update(&stat->errs_rx, clear, 0);

  /* Per-application counters */
  for (idx = 0; idx < sink.num_apps_registered; idx++) {
    app = &stat->node_app_info[idx];

    stats_ctr_update(&app->node_num_pkts_rx, clear, 0);
    stats_ctr_update(&app->node_send_failures, clear, 0);
    stats_ctr_update(&app->node_max_queue_occupancy, clear, 0);
    stats_ctr_update(&app->node_num_pkts_dropped, clear, 0);
    stats_ctr_update(&app->node_num_pkts_tx, clear, 1);
    stats_ctr_update(&app->app_stats_rx_from_node, clear, 0);
    stats_ctr_update(&app->sink_pkt_tx, clear, 0);
    stats_ctr_update(&app->sink_pkt_rx, clear, 0);
    stats_ctr_update(&app->sink_pkt_expected_rx, clear, 0);
  }

  /* Remaining node-wide counters */
  stats_ctr_update(&stat->time_awake_mins, clear, 0);
  stats_ctr_update(&stat->num_metrics_tx, clear, 1);
  stats_ctr_update(&stat->num_stats_tx, clear, 1);
  stats_ctr_update(&stat->num_pkts_tx_succeeded, clear, 0);
  stats_ctr_update(&stat->num_pkts_rx, clear, 0);
  stats_ctr_update(&stat->num_pkts_dropped, clear, 0);
  stats_ctr_update(&stat->num_pkts_tx_failed, clear, 0);
  stats_ctr_update(&stat->num_pkts_crc_error, clear, 0);

  if (clear) {
    stat->rebooted = 0;
  }
}

/*
 * Replace the buffer that *buf points to with a brand-new one: the old
 * buffer is released with buf_free() and *buf is set to the result of
 * buf_new().
 */
void clear_buf(buf_t** buf)
{
  buf_t* stale = *buf;

  buf_free(stale);
  *buf = buf_new();
}


/**** Running Tests ****/

/*
static void
compare_link_quality(sympathy_node_info_t* stat)
{
  float quality;
  int i;

  for (i = 0; i < stat->num_neighbors; i++) {
    if (stat->neighbor_info[i].sim_link_quality < 0) {
      elog(LOG_ERR, "ERROR Couldnt get simulation link quality for nodes %d -> %d\n", 
        stat->addr, stat->neighbors[i].node_id);
      continue;
    }
    quality = 100 * (stat->neighbors[i].quality/255);
    elog(LOG_DEBUG(1), "sim-link: %f, reported: %d, diff: %f\n", 
      stat->neighbor_info[i].sim_link_quality, 
      quality, 
      abs_float(stat->neighbor_info[i].sim_link_quality - 
        quality));
  }
}
*/

/* application-specific tests */
/*
 * Decide whether too few messages arrived.
 *
 * msgs_have             - number of messages actually seen
 * msgs_expected         - number of messages that should have been seen
 * msg_reception_percent - required share of msgs_expected, in percent
 *
 * Returns 1 when msgs_have is below the required share (integer arithmetic,
 * rounded down), or when nothing at all arrived although something was
 * expected; returns 0 otherwise.
 */
static int
check_insufficient_msgs(int msgs_have, int msgs_expected, int msg_reception_percent)
{
  int min_required = (msgs_expected * msg_reception_percent) / 100;

  if (msgs_have < min_required) {
    return 1;
  }

  /* The threshold can round down to 0; still flag total silence */
  if (msgs_expected > 0 && msgs_have == 0) {
    return 1;
  }

  return 0;
}

/*
 * Did the component receive enough of the requests the sink sent?
 * Compares the node's previous-epoch receive count with the sink's
 * previous-epoch transmit count, using the component's own
 * pkt_reception_percent as the threshold.
 *
 * Returns 1 when enough requests arrived; otherwise appends a diagnostic
 * line to fault_buf and returns 0.
 */
static
int received_requests(sympathy_node_app_info_t* snode, buf_t* fault_buf)
{
  if (!check_insufficient_msgs(snode->node_num_pkts_rx.agg_prev_epoch,
        snode->sink_pkt_tx.agg_prev_epoch, snode->pkt_reception_percent))
  {
    return 1;
  }

  bufprintf(fault_buf,"\t0x%x: Num reqs node rx/Num reqs sink tx: %d/%d\n",
      S_CODE(S_COMP_RX_REQS), snode->node_num_pkts_rx.agg_prev_epoch,
      snode->sink_pkt_tx.agg_prev_epoch);
  return 0;
}

/*
 * Is the component transmitting enough responses?  Compares the node's
 * previous-epoch transmit count with the number of packets the sink expected
 * to receive, using the component's pkt_reception_percent as the threshold.
 *
 * Returns 1 when enough were sent; otherwise appends a diagnostic line to
 * fault_buf and returns 0.
 */
static
int comp_tx_data(sympathy_node_app_info_t* snode, buf_t* fault_buf)
{
  if (!check_insufficient_msgs(snode->node_num_pkts_tx.agg_prev_epoch,
        snode->sink_pkt_expected_rx.agg_prev_epoch, snode->pkt_reception_percent))
  {
    return 1;
  }

  bufprintf(fault_buf,
      "\tComp tx/ sink expected: %d/%d\n",
      snode->node_num_pkts_tx.agg_prev_epoch,
      snode->sink_pkt_expected_rx.agg_prev_epoch);
  return 0;
}

/* Returns 1 when the node's previous-epoch metrics-transmit count is
 * positive, 0 otherwise. */
static
int node_tx_metrics(sympathy_node_info_t* stat)
{
  if (stat->num_metrics_tx.agg_prev_epoch > 0) return 1;
  return 0;
}

/*
 * Did anything arrive on this counter during the previous epoch?
 *
 * pkts_rx   - counter to inspect; only agg_prev_epoch decides the result
 * type      - short label used in the diagnostic line (e.g. "data", "stats")
 * fault_buf - buffer that receives the diagnostic line on failure; may be
 *             NULL when the caller only wants the verdict (track_lost_nodes()
 *             calls it that way), in which case nothing is written
 *
 * Returns 1 when agg_prev_epoch is non-zero, 0 otherwise.
 *
 * NR: a time-based variant, get_minutes_since_event(&pkts_rx->last_updated)
 * compared against EPOCH_MSEC/60000, was tried here and did not work; the
 * aggregate counter is used instead.
 */
int received_data(stats_ctr_t* pkts_rx, char* type, buf_t* fault_buf)
{
  if (pkts_rx->agg_prev_epoch == 0) {
    /* Never hand a NULL buffer to bufprintf() */
    if (fault_buf) {
      bufprintf(fault_buf, "\t%s: Num pkts rx: %d(%d)\n",
          type, pkts_rx->ctr, pkts_rx->agg_prev_epoch);
    }
    return 0;
  }
  return 1;
}

/*
 * Compare what the sink received with what the node says it transmitted
 * (both previous-epoch aggregates).  The sink count must reach
 * pkt_reception_percent of the node count.
 *
 * Returns 1 when it does; otherwise appends a diagnostic line to fault_buf
 * and returns 0.
 */
static
int receiving_data_node_tx(stats_ctr_t* pkts_sink_rx,
    stats_ctr_t* pkts_node_tx, buf_t* fault_buf, int pkt_reception_percent)
{
  if (!check_insufficient_msgs(pkts_sink_rx->agg_prev_epoch,
        pkts_node_tx->agg_prev_epoch, pkt_reception_percent))
  {
    return 1;
  }

  bufprintf(fault_buf,
      "\tsink rx pkts/Node tx pkts: %d/%d\n",
      pkts_sink_rx->agg_prev_epoch, pkts_node_tx->agg_prev_epoch);
  return 0;
}

/*
 * Congestion heuristic on receive counters (previous-epoch aggregates).
 *
 * Trips when there was at least one bad packet AND the number of bad packets
 * is at least PERCENT_GOOD_PACKETS_CONGESTION percent of the good packets.
 * In that case a diagnostic line is appended to fault_buf and 1 is returned;
 * otherwise 0 is returned.
 *
 * `stat` is not used by this function.
 */
static int check_errs_rx(sympathy_node_info_t* stat, stats_ctr_t* errors_rx, stats_ctr_t*
    good_rx, buf_t* fault_buf)
{
  /* No bad packets at all: nothing to report */
  if (!(errors_rx->agg_prev_epoch > 0)) {
    return 0;
  }

  /* Bad packets are below the configured share of good ones */
  if (!(errors_rx->agg_prev_epoch
        >= (good_rx->agg_prev_epoch*PERCENT_GOOD_PACKETS_CONGESTION)/100)) {
    return 0;
  }

  bufprintf(fault_buf, "\tRx bad/good %d/%d\n",
      errors_rx->agg_prev_epoch, good_rx->agg_prev_epoch);
  return 1;
}

/*
 * Did the sink receive at least pkt_reception_percent of the packets it
 * expected (both previous-epoch aggregates)?
 *
 * Returns 1 when it did.  Otherwise a diagnostic line is appended to
 * fault_buf, the buffer contents are written to the debug log, and 0 is
 * returned.
 */
static
int received_sufficient_data(stats_ctr_t* pkts_rx,
    stats_ctr_t* expected_pkts_rx, buf_t* fault_buf, int pkt_reception_percent)
{
  if (!check_insufficient_msgs(pkts_rx->agg_prev_epoch,
        expected_pkts_rx->agg_prev_epoch, pkt_reception_percent))
  {
    return 1;
  }

  bufprintf(fault_buf, "\tSink rx/Expected to rx: %d/%d\n",
      pkts_rx->agg_prev_epoch, expected_pkts_rx->agg_prev_epoch);
  elog(LOG_DEBUG(1), "CHECK fault-buf: %s\n", fault_buf->buf);
  return 0;
}

/*
 * Has the sink received any packet from this node recently?  The answer is
 * whatever event_valid() reports for the timestamp of the node's last
 * packet (stat->packet.last_updated).
 *
 * An earlier variant compared get_minutes_since_event() on that timestamp
 * against EPOCH_MSEC/60000 (returning 0 once that many minutes had passed);
 * it is disabled.
 */
static int received_some_pkts_from_node(sympathy_node_info_t* stat)
{
  int pkt_seen = event_valid(&stat->packet.last_updated);

  return pkt_seen;
}

#ifdef USE_BAYES
/*
 * Only compiled when USE_BAYES is defined.  Reports whether the
 * SNON_ROUTING_PKT entry of the node's tos_packets[] is positive.
 *
 * NOTE(review): elsewhere in this file tos_packets[j] is passed by address
 * to stats_ctr_update() exactly like the stats_ctr_t counters, whose values
 * are read through fields such as .agg_prev_epoch or .ctr.  If tos_packets[]
 * is an array of stats_ctr_t, comparing the element itself with 0 will not
 * compile - confirm the element type and which field was intended.
 */
int received_non_symp_app_pkts_from_node(sympathy_node_info_t* stat)
{
  return (stat->tos_packets[SNON_ROUTING_PKT] > 0);
}
#endif

/**** General Testing Framework ***/
/*
 * Collect every other source that currently shares this node's next-hop.
 *
 * A source qualifies when this node has a next-hop greater than 0, the
 * source uses the same next-hop towards the same sink, and the source's
 * route is valid (route_valid()).  Each match is stored in
 * stat->nodes_with_same_next_hop[], counted in stat->num_with_same_next_hop,
 * and listed in stat->topology_info (with its root cause when it has a
 * failure of its own).
 *
 * NOTE(review): the index into nodes_with_same_next_hop[] is not checked
 * against the array's capacity here - confirm it can hold every source.
 */
static
void find_nodes_with_same_next_hop(sympathy_node_info_t* stat)
{
  int src;

  stat->num_with_same_next_hop = 0;

  for (src = 0; src < sink.num_srcs; src++)
  {
    /* Skip the node's own entry */
    if (sink.status_srcs[src].addr == stat->addr) continue;

    /* All of the following must hold, checked in this order */
    if (!(stat->next_hop.next_hop > 0)) continue;
    if (!(sink.status_srcs[src].next_hop.next_hop == stat->next_hop.next_hop)) continue;
    if (!(sink.status_srcs[src].next_hop.sink == stat->next_hop.sink)) continue;
    if (!route_valid(&sink.status_srcs[src])) continue;

    bufprintf(stat->topology_info, ", %d ", sink.status_srcs[src].addr);
    stat->nodes_with_same_next_hop[stat->num_with_same_next_hop] =
      sink.status_srcs[src].addr;

    /* Mention the other node's root cause when it has a failure too */
    if (sink.status_srcs[src].failure_type > SFL_OK)
    {
      bufprintf(stat->topology_info, "(root-cause=%s)",
          decode_root_cause(sink.status_srcs[src].failure_type,
            sink.status_srcs[src].addr));
    }
    stat->num_with_same_next_hop++;
  }

  if (stat->num_with_same_next_hop)
  {
    bufprintf(stat->topology_info, "have the same next-hop(%d)!\n",
        stat->next_hop.next_hop);
  }
}

/*** Global Functions ***/

/*
 * Returns 1 when the test identified by `code` is NOT flagged in the
 * error_events bitmask (none of the S_CODE(code) bits are set), 0 when it is
 * flagged.
 */
int check_passed(int error_events, int code)
{
  if (error_events & S_CODE(code)) {
    return 0;
  }
  return 1;
}

/*
 * Event callback that drives one analysis period.
 *
 * Order matters:
 *   1. track_lost_nodes() analyses the period that just ended;
 *   2. sink.window is advanced (modulo TRACK_FAILURE_WINDOW_SIZE) and
 *      sink.metric_pd is incremented;
 *   3. only then are every node's counters updated with clear == 1, and any
 *      node that has a failure is passed to sympathy_emview_text();
 *   4. the metrics status device is notified.
 *
 * The data, interval and event arguments are not used.  Always returns
 * EVENT_RENEW.
 */
int call_track_lost_nodes(void* data, int interval, g_event_t* event)
{
  int src;

  track_lost_nodes();

  inc_mod((uint16_t *) &sink.window, 1, TRACK_FAILURE_WINDOW_SIZE);
  sink.metric_pd++;

  /* update_counters has to run AFTER sink.window was incremented */
  for (src = 0; src < sink.num_srcs; src++)
  {
    update_counters(&sink.status_srcs[src], 1);

    if (sink.status_srcs[src].failure_type > SFL_OK)
    {
      sympathy_emview_text(&sink.status_srcs[src]);
    }
  }

  g_status_dev_notify(sink.metrics_status);
  return EVENT_RENEW;
}

/*
 * Step 1 (node level): run every node-wide test on `stat` and flag each
 * failed test in stat->error_events via log_failed().
 *
 * The tests are in no specific order as far as the resulting bitmask is
 * concerned, but several of them append text to stat->fault_buf or
 * stat->topology_info when they fail, so reordering them changes the order
 * of that text.
 */
static
void step1_run_tests(sympathy_node_info_t* stat)
{
  /* Is there a valid route naming this sink? */
  if (!route_to_sink(stat)) {
    log_failed(&stat->error_events,S_ROUTE_TO_SINK);
  }

  /* Does the node report any neighbors at all? */
  if (!node_has_neighbors(stat)) {
    log_failed(&stat->error_events,S_NEIGHBORS);
  }
  
  /* Check if node received mostly good packets from other nodes */
  if (check_errs_rx(stat, &stat->num_pkts_crc_error, &stat->num_pkts_rx, stat->fault_buf)) {
    log_failed(&stat->error_events, S_NO_COLLISIONS);
    stat->congestion_detected = 1;
  }

  /* Any metrics at all in the previous epoch? */
  if (!received_data(&stat->metrics_rx, "data", stat->fault_buf)) {
    log_failed(&stat->error_events, S_RX_DATA_THIS_PD);
  }

  /* Enough metrics compared with what the sink expected? */
  if (!received_sufficient_data(&stat->metrics_rx, 
        &sink.expected_num_sympathy_metrics, stat->fault_buf, 
        SMSG_RECEPTION_THRESH_DEFAULT)) {
    elog(LOG_DEBUG(1), "CHECK node %d didnt rx suff data!\n", 
        stat->addr);
    log_failed(&stat->error_events, S_RX_SUFFICIENT_DATA);
  }

  /* Any statistics packets in the previous epoch? */
  if (!received_data(&stat->sympathy_stats_rx, "stats", stat->fault_buf)) {
    log_failed(&stat->error_events, S_RX_STATS);
  }

  /* Did the node report transmitting any metrics? */
  if (!node_tx_metrics(stat)) log_failed(&stat->error_events, S_COMP_TX_DATA);

  /* Statistics received by the sink versus statistics the node sent */
  if (!receiving_data_node_tx(&stat->sympathy_stats_rx, 
        &stat->num_stats_tx, stat->fault_buf, SMSG_RECEPTION_THRESH_DEFAULT))
  {
    log_failed(&stat->error_events,S_RX_STATS_COMP_TX);
  }
  /* Is the timestamp of the node's last packet still valid? */
  if (!received_some_pkts_from_node(stat))
  {
    log_failed(&stat->error_events,S_RX_SOME_PKTS_FROM_NODE);
  }
  /* Does any other node list this one as a neighbor? */
  if (!node_heard_from(stat))
  {
    log_failed(&stat->error_events,S_NODE_HEARD_FROM);
  }
}

/*
 * Step 1 (component level): run the per-application tests on one
 * component of a node and flag each failed test in snode->error_events.
 * Failing tests append their diagnostic text to snode->fault_buf, in the
 * order the tests appear below.
 */
static void step1_check_component(sympathy_node_app_info_t* snode)
{
  /* If we haven't received app metrics, then we can't categorize! */
  if (!received_data(&snode->app_stats_rx_from_node, "stats", snode->fault_buf)) 
  {
    log_failed(&snode->error_events, S_RX_STATS);
  }
  /* Did the component receive enough of the sink's requests? */
  if (!received_requests(snode, snode->fault_buf)) 
  {
    log_failed(&snode->error_events, S_COMP_RX_REQS);
  }
  /* Did the component transmit enough compared with what the sink expected? */
  if (!comp_tx_data(snode, snode->fault_buf)) 
  {
    log_failed(&snode->error_events, S_COMP_TX_DATA);
  }
  /* Did the sink receive enough of the packets it expected? */
  if (!received_sufficient_data(&snode->sink_pkt_rx, 
        &snode->sink_pkt_expected_rx, snode->fault_buf, snode->pkt_reception_percent))
  {
    log_failed(&snode->error_events, S_RX_SUFFICIENT_DATA);
  }

  /* Did the sink receive anything at all from this component? */
  if (!received_data(&snode->sink_pkt_rx, "data", snode->fault_buf)) 
  {
    log_failed(&snode->error_events, S_RX_DATA_THIS_PD);
  }
}

/*
 * Step 2: turn a bitmask of failed tests into a failure category.
 *
 * Receipt of metrics is what decides whether enough data arrived from a
 * node; the components are checked afterwards by the caller.
 *
 *   S_RX_SOME_PKTS_FROM_NODE failed          -> SFL_NO_DATA
 *   otherwise S_RX_SUFFICIENT_DATA failed    -> SFL_INSUFFICIENT_DATA
 *   otherwise                                -> SFL_OK
 */
static
int step2_set_failure(int error_events)
{
  if (check_passed(error_events, S_RX_SOME_PKTS_FROM_NODE))
  {
    if (check_passed(error_events, S_RX_SUFFICIENT_DATA)) return SFL_OK;
    return SFL_INSUFFICIENT_DATA;
  }

  return SFL_NO_DATA;
}

/*
 * Step 3 for the sink's own entry: look for a system-wide failure.
 *
 * First runs check_sink_collisions().  Then the first check that fails, in
 * this order, sets stat->failure_root_cause and makes the result
 * SFL_NO_DATA:
 *   no neighbors                          -> SRC_NO_NEIGHBORS
 *   nobody lists this node as a neighbor  -> SRC_NOBODY_CLAIMS_SINK_AS_NEIGHBOR
 *   no route to the sink                  -> SRC_NO_ROUTE
 * When every check passes the result is SFL_OK and failure_root_cause is
 * left as it was.
 *
 * Returns the failure category.
 */
static
int step3_root_cause_system_failure(sympathy_node_info_t* stat)
{
  int failure = SFL_OK;

  check_sink_collisions();

  if (!node_has_neighbors(stat))
  {
    stat->failure_root_cause = SRC_NO_NEIGHBORS;
    failure = SFL_NO_DATA;
  }
  else if (!node_heard_from(stat))
  {
    stat->failure_root_cause = SRC_NOBODY_CLAIMS_SINK_AS_NEIGHBOR;
    failure = SFL_NO_DATA;
  }
  else if (!route_to_sink(stat))
  {
    stat->failure_root_cause = SRC_NO_ROUTE;
    failure = SFL_NO_DATA;
  }

  elog(LOG_DEBUG(1), "CHECK system failure = %d, root-cause: %d\n",
      failure, stat->failure_root_cause);
  return failure;
}

/*
 * Step 3: pick the root cause for a failure on a node or on one of its
 * components.
 *
 * stat         - the node; its own error_events supply the node-level tests
 *                (heard-from, neighbors, route)
 * error_events - bitmask of the entity being diagnosed (the node's or a
 *                component's); supplies the data/requests/statistics tests
 * test_events  - when non-zero, check_events(stat) is run first
 *
 * The first matching rule wins:
 *   node rebooted                                     -> SRC_NODE_REBOOTED
 *   no packets from node AND nobody heard the node    -> SRC_NODE_FAILED
 *   node has no neighbors                             -> SRC_NO_NEIGHBORS
 *   node has no route to the sink                     -> SRC_NO_ROUTE
 *   no statistics received                            -> SRC_BAD_PATH_TO_SINK
 *   component is not receiving requests               -> SRC_BAD_PATH_TO_NODE
 *   component is not transmitting in response         -> SRC_BAD_NODE_TRANSMIT
 *   anything else                                     -> SRC_BAD_PATH_TO_SINK
 */
static
int step3_root_cause_failure(sympathy_node_info_t* stat, int error_events,
    int test_events)
{
  if (test_events) {
    check_events(stat);
  }

  if (stat->rebooted) {
    return SRC_NODE_REBOOTED;
  }

  if (!check_passed(error_events, S_RX_SOME_PKTS_FROM_NODE)
      && !check_passed(stat->error_events, S_NODE_HEARD_FROM)) {
    return SRC_NODE_FAILED;
  }

  if (!check_passed(stat->error_events, S_NEIGHBORS)) {
    return SRC_NO_NEIGHBORS;
  }

  if (!check_passed(stat->error_events, S_ROUTE_TO_SINK)) {
    return SRC_NO_ROUTE;
  }

  /* Without statistics from this component sympathy cannot say more, and
   * just assumes the sink is not receiving the data the node sends. */
  if (!check_passed(error_events, S_RX_STATS)) {
    return SRC_BAD_PATH_TO_SINK;
  }

  /* Maybe the component is not receiving the requests */
  if (!check_passed(error_events, S_COMP_RX_REQS)) {
    return SRC_BAD_PATH_TO_NODE;
  }

  /* Otherwise the node may not be transmitting data in response */
  if (!check_passed(error_events, S_COMP_TX_DATA)) {
    return SRC_BAD_NODE_TRANSMIT;
  }

  return SRC_BAD_PATH_TO_SINK;
}

/*
 * Step 4: decide WHERE a node's failure originates and record the answer in
 * stat->failure_localization (S_SELF, S_SINK or S_PATH) and, for S_SINK and
 * S_PATH, in stat->source_node_failure.
 *
 * This function is only called when the node has a failure to begin with.
 * It also refreshes the list of nodes sharing this node's next-hop
 * (find_nodes_with_same_next_hop()).
 *
 * NOTE(review): sink_stat comes from find_status_ptr(my_node_id) and is
 * dereferenced without a NULL check - confirm the sink always has an entry.
 */
static
void step4_localize_failure(sympathy_node_info_t* stat)
{
  sympathy_node_info_t* curr_stat = NULL;
  sympathy_node_info_t* sink_stat = find_status_ptr(my_node_id);
  int iter = 0;

  /* Default answer; only overridden by the cases below */
  stat->failure_localization = S_SELF;
  find_nodes_with_same_next_hop(stat);

  /* The sink's own failure is always localized to itself (failures of
   * other nodes may be localized to the sink further down). */
  if (stat->addr == my_node_id) {
    stat->failure_localization = S_SELF;
    return;
  }

  /* If the node rebooted, nothing can explain that along the path */
  else if (stat->rebooted) {
    stat->failure_localization = S_SELF;
    return;
  }

  /* If the sink has a failure, or the node's failure is just a communication
   * issue and the sink is experiencing congestion, then localize it to the
   * sink */
  else if ((sink_stat->failure_type > SFL_OK)
            || ((stat->failure_root_cause < SRC_NO_NEIGHBORS)
                && (sink_stat->congestion_detected))) {
     stat->failure_localization = S_SINK;
     stat->source_node_failure = my_node_id;
     return;
   }

  /* Otherwise we try to find the source of the failure somewhere in the
   * network. Even if the route is not valid, we still use it as an indicator
   * for fault localization. This is a reasonable thing to do because the
   * route is probably invalid as a result of the fault. */
  //else if (route_valid(stat)) {
  else {
    curr_stat = stat;
    /* Walk the nodes returned by iter_next_hop() until it returns NULL */
    while ((curr_stat = iter_next_hop(curr_stat, &iter, 
             curr_stat->next_hop.sink))) {

      /* If this node along the path has no root-caused failure
       * then it cannot be the source of the node's failure */
      if (!(curr_stat->congestion_detected 
            || (curr_stat->failure_type > SFL_OK))) continue;

      /* If we find a worse root-cause on a node closer to the sink 
       * (worse defined by ordering of root-causes), then that node
       * is the source of the current node's problems.
       * There are some exceptions to this ordering.
       * If the current node's failure is less critical than
       * no-neighbors, then it can be explained by any failure
       * downstream from it - NOT just one that is "worse".
       * The assignment is not followed by a break, so a later qualifying
       * node on the path overwrites an earlier one. */
      if ((curr_stat->failure_root_cause >= stat->failure_root_cause) 
             || (stat->failure_root_cause < SRC_NO_NEIGHBORS)) {
        stat->failure_localization = S_PATH;
        stat->source_node_failure = curr_stat->addr;
      }

      /* If the next hop isn't valid, we can't gauge the rest of 
       * the route, so we keep the current status */
      if (!route_valid(curr_stat)) break;
    }
  }

  /* If failure localized to self, and there is congestion at this node,
   * then we will localize it to the path, but specify the node as itself */
  if (stat->failure_localization == S_SELF) {
    if (stat->congestion_detected) {
        stat->source_node_failure = stat->addr;
        stat->failure_localization = S_PATH;
    }
  }

  return;
}

/*
 * Run one analysis pass over every node the sink knows about.
 *
 *  1. For each node: calculate the last-epoch counter values
 *     (update_counters with clear == 0), reset error_events, the fault and
 *     topology buffers and congestion_detected, and record in metrics_valid
 *     whether any metrics arrived.
 *  2. Stop here until sink.metric_pd has reached TRACK_FAILURE_WINDOW_SIZE.
 *  3. For each node: set failure_type and failure_root_cause.  The sink's
 *     own entry is checked for system-wide failures; every other node goes
 *     through step1/step2/step3, first for the node itself and then for each
 *     registered application component.  When a node's root cause differs
 *     from the one it had on entry, period_root_caused is updated and the
 *     summary and fail status devices are notified once at the end.
 *  4. Update sink.expected_num_sympathy_metrics (once without and once with
 *     clear).
 *  5. For each node that has a failure: step4_localize_failure().
 */
void track_lost_nodes()
{
  int i, tmp_root_cause, j;
  sympathy_node_app_info_t* snode;
  sympathy_node_info_t* stat;
  int notify = 0;

  elog(LOG_DEBUG(1), "window: %d, metric-pd: %d\n", sink.window,
      sink.metric_pd);

  /* Only include nodes whom we have heard from in the past
   * epoch - in analysis of other nodes  */
  for (i = 0; i < sink.num_srcs; i++)
  {
    stat = &sink.status_srcs[i];

    /* Calculate values for last-epoch */
    update_counters(stat, 0);

    /* Clear previous values, which are stored until next calculation */
    stat->error_events = 0;
    clear_buf(&stat->fault_buf);
    clear_buf(&stat->topology_info);

    /* Only the verdict is wanted here, hence the NULL fault buffer.
     * NOTE(review): relies on received_data()'s failure path tolerating a
     * NULL buffer - confirm. */
    if (!received_data(&stat->metrics_rx, "data", NULL))  {
      stat->metrics_valid = 0;
    }
    else stat->metrics_valid = 1;
    stat->congestion_detected = 0;
  }

  /* Don't begin checking for failures until we have had
   * TRACK_FAILURE_WINDOW_SIZE metrics periods */
  if (sink.metric_pd < TRACK_FAILURE_WINDOW_SIZE) return;

  /* Set the failure category for all nodes */
  for (i = 0; i < sink.num_srcs; i++)
  {
    stat = &sink.status_srcs[i];

    /* Remember the previous root cause so a change can be detected below */
    tmp_root_cause = stat->failure_root_cause;

    /* If the node is a sink, then look for system wide failures because the
     * sink itself won't have any failures we want to detect */
    if (stat->addr == my_node_id)
    {
      stat->failure_type = step3_root_cause_system_failure(stat);
    }

    /* Otherwise check failures on the node, sympathy and remaining components */
    else
    {
      step1_run_tests(stat);

      // NR make sympathy one of the components and check it as such?
      stat->failure_type = step2_set_failure(stat->error_events);

      /* Then check metrics for each application registered with sympathy */
      for (j = 0; j < sink.num_apps_registered; j++) {
        snode = &stat->node_app_info[j];
        snode->error_events = 0;
        clear_buf(&snode->fault_buf);

        /* Run tests on component */
        step1_check_component(snode);
        snode->failure_type = step2_set_failure(snode->error_events);

        /* If this component has a failure, root-cause it and keep the
         * result.  The value used to be discarded, so the copy into
         * stat->failure_root_cause below read a root cause that this pass
         * had never computed. */
        if (snode->failure_type > SFL_OK) {
          snode->failure_root_cause =
            step3_root_cause_failure(stat, snode->error_events, 0);
        }

        /* If the current failure assignment for the node is an OK, and a component
         * has a failure, then we set the current failure assignment
         * to Insufficient data */
        if ((stat->failure_type == SFL_OK) && (snode->failure_type > SFL_OK)) {
          stat->failure_type = SFL_INSUFFICIENT_DATA;
          stat->failure_root_cause = snode->failure_root_cause;
        }
      }

      /* If we have a failure from this node, then try to root-cause it */
      if (stat->failure_type > SFL_OK) {
        stat->failure_root_cause =
          step3_root_cause_failure(stat, stat->error_events, 1);
        elog(LOG_DEBUG(1), "CHECK node %d had failure: %d, root-cause: %d\n",
            stat->addr, stat->failure_type, stat->failure_root_cause);
      }
    }

    /* If this root-cause is new, we note it and notify devices */
    if (stat->failure_type > SFL_OK) {
      if (tmp_root_cause != stat->failure_root_cause) {
        stat->period_root_caused = sink.metric_pd;
        notify = 1;
      }
    }
  }

  if (notify) {
    g_status_dev_notify(sink.summary_status);
    g_status_dev_notify(sink.fail_status);
  }

#ifdef USE_BAYES
  /* create bayes network based on routes */
  bayes_classify_network(my_node_id);
#endif

  /* Clear the agg_prev_epoch values, and update counters */
  stats_ctr_update(&sink.expected_num_sympathy_metrics, 0, 0);
  stats_ctr_update(&sink.expected_num_sympathy_metrics, 1, 0);

  /* Diagnose the failure - either caused by congestion along
   * the path, or what? */
  for (j = 0; j < sink.num_srcs; j++) {
    stat = &sink.status_srcs[j];
    if (stat->failure_type > SFL_OK) step4_localize_failure(stat);
  }
}




See more files for this project here

EmStar

EmStar is a software system for developing and deploying wireless sensor networks involving Linux-based platforms. As the wireless sensor network community has attempted to deploy more complex designs---large-scale, long-lived systems that need self-organization and adaptivity---a number of difficult software design issues have arisen. Advances in software design have not kept pace with the capabilities of hardware. This is because designing for an adaptive, efficient, and useful sensor network has turned out to be surprisingly complex and difficult. EmStar is a Linux-based software framework, whose goal is to dramatically reduce this complexity, enabling work to be shared and reused, and simplifying and speeding the design of new sensor network applications.

Project homepage: http://cvs.cens.ucla.edu/emstar/
Programming language(s): C,Shell Script
License: other

  bayes/
    lib/
      libnetica.a
    src/
      Netica.h
      NeticaEx.c
      NeticaEx.h
    bayes_classifier.c
    nodes.states
    training.prob
  include/
    sympathy.h
    sympathy_app.h
    sympathy_dev.h
    sympathy_routing.h
  libsympathy/
    sympathy_routing.c
  scripts/
    get_recent_status.pl
    get_throughput.pl
  testtabs/
    essjr.run
    essjr_sensor.run
    rr_mote.run
    rr_mote_ceiling.run
    sympathy.run
    sympathy_ceiling.sim
    sympathy_sim.sim
    sympathy_sim_small.sim
    sympathy_snoop.run
    sympathy_test.sim
  BUILD
  data.xml
  jr_loc_small
  sars.pl
  sympathy_analyze.c
  sympathy_battery.c
  sympathy_device.c
  sympathy_doc
  sympathy_emview.c
  sympathy_events.c
  sympathy_main.c
  sympathy_print_stats.c
  sympathy_status.c