001package io.prometheus.client;
002
003import io.prometheus.client.CKMSQuantiles.Quantile;
004
005import java.io.Closeable;
006import java.util.ArrayList;
007import java.util.Collections;
008import java.util.List;
009import java.util.Map;
010import java.util.SortedMap;
011import java.util.TreeMap;
012import java.util.concurrent.Callable;
013import java.util.concurrent.TimeUnit;
014
015/**
016 * {@link Summary} metrics and {@link Histogram} metrics can both be used to monitor distributions like latencies or request sizes.
017 * <p>
018 * An overview of when to use Summaries and when to use Histograms can be found on <a href="https://prometheus.io/docs/practices/histograms">https://prometheus.io/docs/practices/histograms</a>.
019 * <p>
020 * The following example shows how to measure latencies and request sizes:
021 *
022 * <pre>
023 * class YourClass {
024 *
025 *   private static final Summary requestLatency = Summary.build()
026 *       .name("requests_latency_seconds")
027 *       .help("request latency in seconds")
028 *       .register();
029 *
030 *   private static final Summary receivedBytes = Summary.build()
031 *       .name("requests_size_bytes")
032 *       .help("request size in bytes")
033 *       .register();
034 *
035 *   public void processRequest(Request req) {
036 *     Summary.Timer requestTimer = requestLatency.startTimer();
037 *     try {
038 *       // Your code here.
039 *     } finally {
040 *       requestTimer.observeDuration();
041 *       receivedBytes.observe(req.size());
042 *     }
043 *   }
044 * }
045 * </pre>
046 *
047 * The {@link Summary} class provides different utility methods for observing values, like {@link #observe(double)},
048 * {@link #startTimer()} and {@link Timer#observeDuration()}, {@link #time(Callable)}, etc.
049 * <p>
050 * By default, {@link Summary} metrics provide the {@code count} and the {@code sum}. For example, if you measure
051 * latencies of a REST service, the {@code count} will tell you how often the REST service was called,
052 * and the {@code sum} will tell you the total aggregated response time.
053 * You can calculate the average response time using a Prometheus query dividing {@code sum / count}.
054 * <p>
055 * In addition to {@code count} and {@code sum}, you can configure a Summary to provide quantiles:
056 *
057 * <pre>
058 * Summary requestLatency = Summary.build()
059 *     .name("requests_latency_seconds")
060 *     .help("Request latency in seconds.")
061 *     .quantile(0.5, 0.01)    // 0.5 quantile (median) with 0.01 allowed error
062 *     .quantile(0.95, 0.005)  // 0.95 quantile with 0.005 allowed error
063 *     // ...
064 *     .register();
065 * </pre>
066 *
067 * As an example, a 0.95 quantile of 120ms tells you that 95% of the calls were faster than 120ms, and 5% of the calls were slower than 120ms.
068 * <p>
 * Tracking exact quantiles requires a large amount of memory, because all observations need to be stored in a sorted list. Therefore, we allow an error to significantly reduce memory usage.
070 * <p>
071 * In the example, the allowed error of 0.005 means that you will not get the exact 0.95 quantile, but anything between the 0.945 quantile and the 0.955 quantile.
072 * <p>
073 * Experiments show that the {@link Summary} typically needs to keep less than 100 samples to provide that precision, even if you have hundreds of millions of observations.
074 * <p>
075 * There are a few special cases:
076 *
077 * <ul>
078 *   <li>You can set an allowed error of 0, but then the {@link Summary} will keep all observations in memory.</li>
079 *   <li>You can track the minimum value with {@code .quantile(0.0, 0.0)}.
080 *       This special case will not use additional memory even though the allowed error is 0.</li>
081 *   <li>You can track the maximum value with {@code .quantile(1.0, 0.0)}.
082 *       This special case will not use additional memory even though the allowed error is 0.</li>
083 * </ul>
084 *
085 * Typically, you don't want to have a {@link Summary} representing the entire runtime of the application,
086 * but you want to look at a reasonable time interval. {@link Summary} metrics implement a configurable sliding
087 * time window:
088 *
089 * <pre>
090 * Summary requestLatency = Summary.build()
091 *     .name("requests_latency_seconds")
092 *     .help("Request latency in seconds.")
093 *     .maxAgeSeconds(10 * 60)
094 *     .ageBuckets(5)
095 *     // ...
096 *     .register();
097 * </pre>
098 *
099 * The default is a time window of 10 minutes and 5 age buckets, i.e. the time window is 10 minutes wide, and
100 * we slide it forward every 2 minutes.
101 */
102public class Summary extends SimpleCollector<Summary.Child> implements Counter.Describable {
103
104  final List<Quantile> quantiles; // Can be empty, but can never be null.
105  final long maxAgeSeconds;
106  final int ageBuckets;
107
108  Summary(Builder b) {
109    super(b);
110    quantiles = Collections.unmodifiableList(new ArrayList<Quantile>(b.quantiles));
111    this.maxAgeSeconds = b.maxAgeSeconds;
112    this.ageBuckets = b.ageBuckets;
113    initializeNoLabelsChild();
114  }
115
116  public static class Builder extends SimpleCollector.Builder<Builder, Summary> {
117
118    private final List<Quantile> quantiles = new ArrayList<Quantile>();
119    private long maxAgeSeconds = TimeUnit.MINUTES.toSeconds(10);
120    private int ageBuckets = 5;
121
122    /**
123     * The class JavaDoc for {@link Summary} has more information on {@link #quantile(double, double)}.
124     * @see Summary
125     */
126    public Builder quantile(double quantile, double error) {
127      if (quantile < 0.0 || quantile > 1.0) {
128        throw new IllegalArgumentException("Quantile " + quantile + " invalid: Expected number between 0.0 and 1.0.");
129      }
130      if (error < 0.0 || error > 1.0) {
131        throw new IllegalArgumentException("Error " + error + " invalid: Expected number between 0.0 and 1.0.");
132      }
133      quantiles.add(new Quantile(quantile, error));
134      return this;
135    }
136
137    /**
138     * The class JavaDoc for {@link Summary} has more information on {@link #maxAgeSeconds(long)} 
139     * @see Summary
140     */
141    public Builder maxAgeSeconds(long maxAgeSeconds) {
142      if (maxAgeSeconds <= 0) {
143        throw new IllegalArgumentException("maxAgeSeconds cannot be " + maxAgeSeconds);
144      }
145      this.maxAgeSeconds = maxAgeSeconds;
146      return this;
147    }
148
149    /**
150     * The class JavaDoc for {@link Summary} has more information on {@link #ageBuckets(int)} 
151     * @see Summary
152     */
153    public Builder ageBuckets(int ageBuckets) {
154      if (ageBuckets <= 0) {
155        throw new IllegalArgumentException("ageBuckets cannot be " + ageBuckets);
156      }
157      this.ageBuckets = ageBuckets;
158      return this;
159    }
160
161    @Override
162    public Summary create() {
163      for (String label : labelNames) {
164        if (label.equals("quantile")) {
165          throw new IllegalStateException("Summary cannot have a label named 'quantile'.");
166        }
167      }
168      dontInitializeNoLabelsChild = true;
169      return new Summary(this);
170    }
171  }
172
173  /**
174   *  Return a Builder to allow configuration of a new Summary. Ensures required fields are provided.
175   *
176   *  @param name The name of the metric
177   *  @param help The help string of the metric
178   */
179  public static Builder build(String name, String help) {
180    return new Builder().name(name).help(help);
181  }
182
183  /**
184   *  Return a Builder to allow configuration of a new Summary.
185   */
186  public static Builder build() {
187    return new Builder();
188  }
189
190  @Override
191  protected Child newChild() {
192    return new Child(quantiles, maxAgeSeconds, ageBuckets);
193  }
194
195
196  /**
197   * Represents an event being timed.
198   */
199  public static class Timer implements Closeable {
200    private final Child child;
201    private final long start;
202    private Timer(Child child, long start) {
203      this.child = child;
204      this.start = start;
205    }
206    /**
207     * Observe the amount of time in seconds since {@link Child#startTimer} was called.
208     * @return Measured duration in seconds since {@link Child#startTimer} was called.
209     */
210    public double observeDuration() {
211      double elapsed = SimpleTimer.elapsedSecondsFromNanos(start, SimpleTimer.defaultTimeProvider.nanoTime());
212      child.observe(elapsed);
213      return elapsed;
214    }
215
216    /**
217     * Equivalent to calling {@link #observeDuration()}.
218     */
219    @Override
220    public void close() {
221      observeDuration();
222    }
223  }
224
225  /**
226   * The value of a single Summary.
227   * <p>
228   * <em>Warning:</em> References to a Child become invalid after using
229   * {@link SimpleCollector#remove} or {@link SimpleCollector#clear}.
230   */
231  public static class Child {
232
233    /**
234     * Executes runnable code (e.g. a Java 8 Lambda) and observes a duration of how long it took to run.
235     *
236     * @param timeable Code that is being timed
237     * @return Measured duration in seconds for timeable to complete.
238     */
239    public double time(Runnable timeable) {
240      Timer timer = startTimer();
241
242      double elapsed;
243      try {
244        timeable.run();
245      } finally {
246        elapsed = timer.observeDuration();
247      }
248      return elapsed;
249    }
250
251    /**
252     * Executes callable code (e.g. a Java 8 Lambda) and observes a duration of how long it took to run.
253     *
254     * @param timeable Code that is being timed
255     * @return Result returned by callable.
256     */
257    public <E> E time(Callable<E> timeable) {
258      Timer timer = startTimer();
259
260      try {
261        return timeable.call();
262      } catch (RuntimeException e) {
263        throw e;
264      } catch (Exception e) {
265        throw new RuntimeException(e);
266      } finally {
267        timer.observeDuration();
268      }
269    }
270
271    public static class Value {
272      public final double count;
273      public final double sum;
274      public final SortedMap<Double, Double> quantiles;
275      public final long created;
276
277      private Value(double count, double sum, List<Quantile> quantiles, TimeWindowQuantiles quantileValues, long created) {
278        this.count = count;
279        this.sum = sum;
280        this.quantiles = Collections.unmodifiableSortedMap(snapshot(quantiles, quantileValues));
281        this.created = created;
282      }
283
284      private SortedMap<Double, Double> snapshot(List<Quantile> quantiles, TimeWindowQuantiles quantileValues) {
285        SortedMap<Double, Double> result = new TreeMap<Double, Double>();
286        for (Quantile q : quantiles) {
287          result.put(q.quantile, quantileValues.get(q.quantile));
288        }
289        return result;
290      }
291    }
292
293    // Having these separate leaves us open to races,
294    // however Prometheus as whole has other races
295    // that mean adding atomicity here wouldn't be useful.
296    // This should be reevaluated in the future.
297    private final DoubleAdder count = new DoubleAdder();
298    private final DoubleAdder sum = new DoubleAdder();
299    private final List<Quantile> quantiles;
300    private final TimeWindowQuantiles quantileValues;
301    private final long created = System.currentTimeMillis();
302
303    private Child(List<Quantile> quantiles, long maxAgeSeconds, int ageBuckets) {
304      this.quantiles = quantiles;
305      if (quantiles.size() > 0) {
306        quantileValues = new TimeWindowQuantiles(quantiles.toArray(new Quantile[]{}), maxAgeSeconds, ageBuckets);
307      } else {
308        quantileValues = null;
309      }
310    }
311
312    /**
313     * Observe the given amount.
314     * @param amt in most cases amt should be &gt;= 0. Negative values are supported, but you should read
315     *            <a href="https://prometheus.io/docs/practices/histograms/#count-and-sum-of-observations">
316     *            https://prometheus.io/docs/practices/histograms/#count-and-sum-of-observations</a> for
317     *            implications and alternatives.
318     */
319    public void observe(double amt) {
320      count.add(1);
321      sum.add(amt);
322      if (quantileValues != null) {
323        quantileValues.insert(amt);
324      }
325    }
326    /**
327     * Start a timer to track a duration.
328     * <p>
329     * Call {@link Timer#observeDuration} at the end of what you want to measure the duration of.
330     */
331    public Timer startTimer() {
332      return new Timer(this, SimpleTimer.defaultTimeProvider.nanoTime());
333    }
334    /**
335     * Get the value of the Summary.
336     * <p>
337     * <em>Warning:</em> The definition of {@link Value} is subject to change.
338     */
339    public Value get() {
340      return new Value(count.sum(), sum.sum(), quantiles, quantileValues, created);
341    }
342  }
343
344  // Convenience methods.
345  /**
346   * Observe the given amount on the summary with no labels.
347   * @param amt in most cases amt should be &gt;= 0. Negative values are supported, but you should read
348   *            <a href="https://prometheus.io/docs/practices/histograms/#count-and-sum-of-observations">
349   *            https://prometheus.io/docs/practices/histograms/#count-and-sum-of-observations</a> for
350   *            implications and alternatives.
351   */
352  public void observe(double amt) {
353    noLabelsChild.observe(amt);
354  }
355  /**
356   * Start a timer to track a duration on the summary with no labels.
357   * <p>
358   * Call {@link Timer#observeDuration} at the end of what you want to measure the duration of.
359   */
360  public Timer startTimer() {
361    return noLabelsChild.startTimer();
362  }
363
364  /**
365   * Executes runnable code (e.g. a Java 8 Lambda) and observes a duration of how long it took to run.
366   *
367   * @param timeable Code that is being timed
368   * @return Measured duration in seconds for timeable to complete.
369   */
370  public double time(Runnable timeable){
371    return noLabelsChild.time(timeable);
372  }
373
374  /**
375   * Executes callable code (e.g. a Java 8 Lambda) and observes a duration of how long it took to run.
376   *
377   * @param timeable Code that is being timed
378   * @return Result returned by callable.
379   */
380  public <E> E time(Callable<E> timeable){
381    return noLabelsChild.time(timeable);
382  }
383
384  /**
385   * Get the value of the Summary.
386   * <p>
387   * <em>Warning:</em> The definition of {@link Child.Value} is subject to change.
388   */
389  public Child.Value get() {
390    return noLabelsChild.get();
391  }
392
393  @Override
394  public List<MetricFamilySamples> collect() {
395    List<MetricFamilySamples.Sample> samples = new ArrayList<MetricFamilySamples.Sample>();
396    for(Map.Entry<List<String>, Child> c: children.entrySet()) {
397      Child.Value v = c.getValue().get();
398      List<String> labelNamesWithQuantile = new ArrayList<String>(labelNames);
399      labelNamesWithQuantile.add("quantile");
400      for(Map.Entry<Double, Double> q : v.quantiles.entrySet()) {
401        List<String> labelValuesWithQuantile = new ArrayList<String>(c.getKey());
402        labelValuesWithQuantile.add(doubleToGoString(q.getKey()));
403        samples.add(new MetricFamilySamples.Sample(fullname, labelNamesWithQuantile, labelValuesWithQuantile, q.getValue()));
404      }
405      samples.add(new MetricFamilySamples.Sample(fullname + "_count", labelNames, c.getKey(), v.count));
406      samples.add(new MetricFamilySamples.Sample(fullname + "_sum", labelNames, c.getKey(), v.sum));
407      samples.add(new MetricFamilySamples.Sample(fullname + "_created", labelNames, c.getKey(), v.created / 1000.0));
408    }
409
410    return familySamplesList(Type.SUMMARY, samples);
411  }
412
413  @Override
414  public List<MetricFamilySamples> describe() {
415    return Collections.<MetricFamilySamples>singletonList(new SummaryMetricFamily(fullname, help, labelNames));
416  }
417
418}