From df2638b5bf2c4abd2d4aacc78d0cf3a648af0045 Mon Sep 17 00:00:00 2001
From: Daniel Ecer <de-code@users.noreply.github.com>
Date: Sat, 29 Jul 2017 21:43:40 +0100
Subject: [PATCH] added more detailed scores

---
 sciencebeam_gym/trainer/evaluator.py          | 33 +++++++++++++
 .../trainer/models/pix2pix/evaluate.py        | 46 ++++++++++++++++---
 .../trainer/models/pix2pix/pix2pix_model.py   |  2 +-
 3 files changed, 74 insertions(+), 7 deletions(-)

diff --git a/sciencebeam_gym/trainer/evaluator.py b/sciencebeam_gym/trainer/evaluator.py
index 230f940..978b1c8 100644
--- a/sciencebeam_gym/trainer/evaluator.py
+++ b/sciencebeam_gym/trainer/evaluator.py
@@ -56,6 +56,20 @@ def save_file(filename, data):
   with FileIO(filename, 'wb') as f:
     f.write(data)
 
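+# Helpers deriving precision/recall/F1 from per-class counts (scalars or
+# numpy arrays). Callers are expected to pass float values (see the F1
+# computation below, which casts tp to float); a class with zero support
+# yields NaN rather than raising.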
+def precision_from_tp_fp(tp, fp):
+  return tp / (tp + fp)
+
+def recall_from_tp_fn(tp, fn):
+  return tp / (tp + fn)
+
+def f1_from_precision_recall(precision, recall):
+  return 2 * precision * recall / (precision + recall)
+
+def f1_from_tp_fp_fn(tp, fp, fn):
+  return f1_from_precision_recall(
+    precision_from_tp_fp(tp, fp),
+    recall_from_tp_fn(tp, fn)
+  )
 
 IMAGE_PREFIX = 'image_'
 
@@ -106,6 +120,10 @@ class Evaluator(object):
 
   def _add_evaluation_result_fetches(self, fetches, tensors):
     if tensors.evaluation_result:
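+      # fetch the raw per-class counts alongside the aggregate scores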
+      fetches['tp'] = tensors.evaluation_result.tp
+      fetches['fp'] = tensors.evaluation_result.fp
+      fetches['fn'] = tensors.evaluation_result.fn
+      fetches['tn'] = tensors.evaluation_result.tn
       fetches['accuracy'] = tensors.evaluation_result.accuracy
       fetches['micro_f1'] = tensors.evaluation_result.micro_f1
     return fetches
@@ -114,6 +132,10 @@ class Evaluator(object):
     if accumulated_results is None:
       accumulated_results = []
     accumulated_results.append({
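+      # keep the raw per-class counts so they can be summed across batches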
+      'tp': results['tp'],
+      'fp': results['fp'],
+      'fn': results['fn'],
+      'tn': results['tn'],
       'accuracy': results['accuracy'],
       'micro_f1': results['micro_f1'],
       'count': self.batch_size,
@@ -129,10 +151,21 @@ class Evaluator(object):
           global_step
         )
       )
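+      # sum the per-class counts over all evaluated batches, then derive the
+      # per-class F1 from the aggregated counts (casting to float to avoid
+      # integer division); pooling counts first is more stable than averaging
+      # per-batch F1 scores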
+      tp = np.sum([r['tp'] for r in accumulated_results], axis=0)
+      fp = np.sum([r['fp'] for r in accumulated_results], axis=0)
+      fn = np.sum([r['fn'] for r in accumulated_results], axis=0)
+      tn = np.sum([r['tn'] for r in accumulated_results], axis=0)
+      f1 = f1_from_tp_fp_fn(tp.astype(float), fp, fn)
       scores_str = json.dumps({
         'global_step': global_step,
         'accuracy': float(np.mean([r['accuracy'] for r in accumulated_results])),
+        'tp': tp.tolist(),
+        'fp': fp.tolist(),
+        'fn': fn.tolist(),
+        'tn': tn.tolist(),
+        'f1': f1.tolist(),
         'micro_f1': float(np.mean([r['micro_f1'] for r in accumulated_results])),
+        'macro_f1': float(np.mean(f1)),
         'count': sum([r['count'] for r in accumulated_results])
       }, indent=2)
       with FileIO(scores_file, 'w') as f:
diff --git a/sciencebeam_gym/trainer/models/pix2pix/evaluate.py b/sciencebeam_gym/trainer/models/pix2pix/evaluate.py
index 7f1efe6..fc1ea75 100644
--- a/sciencebeam_gym/trainer/models/pix2pix/evaluate.py
+++ b/sciencebeam_gym/trainer/models/pix2pix/evaluate.py
@@ -11,10 +11,17 @@ EvaluationTensors = collections.namedtuple(
     "tp",
     "fp",
     "fn",
+    "tn",
+    "precision",
+    "recall",
+    "f1",
     "accuracy",
     "micro_precision",
     "micro_recall",
-    "micro_f1"
+    "micro_f1",
+    "macro_precision",
+    "macro_recall",
+    "macro_f1"
   ]
 )
 
@@ -24,28 +31,52 @@ def output_probabilities_to_class(outputs):
 def to_1d_vector(tensor):
   return tf.reshape(tensor, [-1])
 
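+# Tensor versions of the count-based metric helpers; they operate
+# elementwise, so they work on per-class count vectors as well as on
+# scalar totals.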
+def precision_from_tp_fp(tp, fp):
+  return tp / (tp + fp)
+
+def recall_from_tp_fn(tp, fn):
+  return tp / (tp + fn)
+
+def f1_from_precision_recall(precision, recall):
+  return 2 * precision * recall / (precision + recall)
+
 def _evaluate_from_confusion_matrix(confusion, accuracy=None):
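+  # tp is the diagonal of the confusion matrix; fp and fn follow from the
+  # two marginals, and tn is whatever remains of the total count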
+  total = tf.reduce_sum(confusion)
   actual_p = tf.reduce_sum(confusion, axis=0)
   pred_p = tf.reduce_sum(confusion, axis=1)
   tp = tf.diag_part(confusion)
   fp = actual_p - tp
   fn = pred_p - tp
+  tn = total - tp - fp - fn
+  precision = precision_from_tp_fp(tp, fp)
+  recall = recall_from_tp_fn(tp, fn)
+  f1 = f1_from_precision_recall(precision, recall)
   total_tp = tf.reduce_sum(tp)
   total_fp = tf.reduce_sum(fp)
   total_fn = tf.reduce_sum(fn)
   # Note: micro averages (with equal weights) will lead to the same precision, recall, f1
-  micro_precision = total_tp / (total_tp + total_fp)
-  micro_recall = total_tp / (total_tp + total_fn)
-  micro_f1 = 2 * micro_precision * micro_recall / (micro_precision + micro_recall)
+  micro_precision = precision_from_tp_fp(total_tp, total_fp)
+  micro_recall = recall_from_tp_fn(total_tp, total_fn)
+  micro_f1 = f1_from_precision_recall(micro_precision, micro_recall)
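+  # macro averages weight every class equally: the mean of the per-class scores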
+  macro_precision = tf.reduce_mean(precision)
+  macro_recall = tf.reduce_mean(recall)
+  macro_f1 = tf.reduce_mean(f1)
   return EvaluationTensors(
     confusion_matrix=confusion,
     tp=tp,
     fp=fp,
     fn=fn,
+    tn=tn,
+    precision=precision,
+    recall=recall,
+    f1=f1,
     accuracy=accuracy,
     micro_precision=micro_precision,
     micro_recall=micro_recall,
-    micro_f1=micro_f1
+    micro_f1=micro_f1,
+    macro_precision=macro_precision,
+    macro_recall=macro_recall,
+    macro_f1=macro_f1
   )
 
 def evaluate_predictions(labels, predictions, n_classes, has_unknown_class=False):
@@ -80,8 +111,11 @@ def evaluate_separate_channels(targets, outputs, has_unknown_class=False):
   )
 
 
-def evaluation_summary(evaluation_tensors):
+def evaluation_summary(evaluation_tensors, layer_labels):
   tf.summary.scalar("micro_precision", evaluation_tensors.micro_precision)
   tf.summary.scalar("micro_recall", evaluation_tensors.micro_recall)
   tf.summary.scalar("micro_f1", evaluation_tensors.micro_f1)
+  tf.summary.scalar("macro_f1", evaluation_tensors.macro_f1)
   tf.summary.scalar("accuracy", evaluation_tensors.accuracy)
+  for i, layer_label in enumerate(layer_labels):
+    tf.summary.scalar("f1_{}_{}".format(i, layer_label), evaluation_tensors.f1[i])
diff --git a/sciencebeam_gym/trainer/models/pix2pix/pix2pix_model.py b/sciencebeam_gym/trainer/models/pix2pix/pix2pix_model.py
index d5d32d0..a94615f 100644
--- a/sciencebeam_gym/trainer/models/pix2pix/pix2pix_model.py
+++ b/sciencebeam_gym/trainer/models/pix2pix/pix2pix_model.py
@@ -342,7 +342,7 @@ class Model(object):
           has_unknown_class=self.use_unknown_class
         )
         tensors.evaluation_result = evaluation_result
-        evaluation_summary(evaluation_result)
+        evaluation_summary(evaluation_result, self.dimension_labels)
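+        # dimension_labels name the model's output channels and are used
+        # to label the per-channel F1 summaries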
 
     tensors.global_step = pix2pix_model.global_step
     tensors.train = pix2pix_model.train
-- 
GitLab