From 7623bd5fd761b8f895f34303fc2636e70855d3a5 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Thu, 28 Feb 2019 00:12:53 -0800 Subject: [PATCH 1/5] Fix mode() and add multimode() --- Lib/statistics.py | 66 +++++++++++++++++++++---------------- Lib/test/test_statistics.py | 17 +++++----- 2 files changed, 46 insertions(+), 37 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index e85aaa996cc7b4..005d274a60849b 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -17,6 +17,7 @@ median_high High median of data. median_grouped Median, or 50th percentile, of grouped data. mode Mode (most common value) of data. +multimode List of modes (most common values of data) ================== ============================================= Calculate the arithmetic mean ("the average") of data: @@ -82,7 +83,6 @@ 'mean', 'mode', 'harmonic_mean', 'fmean', ] -import collections import math import numbers import random @@ -92,8 +92,8 @@ from itertools import groupby from bisect import bisect_left, bisect_right from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum - - +from operator import itemgetter +from collections import Counter # === Exceptions === @@ -249,20 +249,6 @@ def _convert(value, T): raise -def _counts(data): - # Generate a table of sorted (value, frequency) pairs. - table = collections.Counter(iter(data)).most_common() - if not table: - return table - # Extract the values with the highest frequency. - maxfreq = table[0][1] - for i in range(1, len(table)): - if table[i][1] != maxfreq: - table = table[:i] - break - return table - - def _find_lteq(a, x): 'Locate the leftmost value exactly equal to x' i = bisect_left(a, x) @@ -523,19 +509,38 @@ def mode(data): >>> mode(["red", "blue", "blue", "red", "green", "red", "red"]) 'red' - If there is not exactly one most common value, ``mode`` will raise - StatisticsError. + If there are multiple modes, return the first one encountered. 
+ + >>> mode(['red', 'red', 'green', 'blue', 'blue']) + 'red' + + If *data* is empty, ``mode`` raises StatisticsError. + """ - # Generate a table of sorted (value, frequency) pairs. - table = _counts(data) - if len(table) == 1: - return table[0][0] - elif table: - raise StatisticsError( - 'no unique mode; found %d equally common values' % len(table) - ) - else: - raise StatisticsError('no mode for empty data') + data = iter(data) + try: + return Counter(data).most_common(1)[0][0] + except IndexError: + raise StatisticsError('no mode for empty data') from None + + +def multimode(data): + """ Return a list of the most frequently occurring values. + + Will return more than one result is there are multiple modes + or an empty list if *data* is empty. + + >>> multimode('aabbbbbbbbcc') + ['b'] + >>> multimode('aabbbbccddddeeffffgg') + ['b', 'd', 'f'] + >>> multimode('') + [] + + """ + counts = Counter(iter(data)).most_common() + maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, [])) + return list(map(itemgetter(0), mode_items)) # === Measures of spread === @@ -836,6 +841,7 @@ def __repr__(self): from math import isclose from operator import add, sub, mul, truediv from itertools import repeat + import doctest g1 = NormalDist(10, 20) g2 = NormalDist(-5, 25) @@ -893,3 +899,5 @@ def assert_close(G1, G2): S = NormalDist.from_samples([x - y for x, y in zip(X.samples(n), Y.samples(n))]) assert_close(X - Y, S) + + print(doctest.testmod()) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index a63e4bf6cc84d7..274fbf6cd66096 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -1769,7 +1769,7 @@ def prepare_data(self): def test_range_data(self): # Override test from UnivariateCommonMixin. data = range(20, 50, 3) - self.assertRaises(statistics.StatisticsError, self.func, data) + self.assertEqual(self.func(data), 20) def test_nominal_data(self): # Test mode with nominal data. 
@@ -1790,13 +1790,14 @@ def test_bimodal_data(self): # Test mode with bimodal data. data = [1, 1, 2, 2, 2, 2, 3, 4, 5, 6, 6, 6, 6, 7, 8, 9, 9] assert data.count(2) == data.count(6) == 4 - # Check for an exception. - self.assertRaises(statistics.StatisticsError, self.func, data) + # mode() should return 2, the first encountered mode + self.assertEqual(self.func(data), 2) - def test_unique_data_failure(self): - # Test mode exception when data points are all unique. + def test_unique_data(self): + # Test mode when data points are all unique. data = list(range(10)) - self.assertRaises(statistics.StatisticsError, self.func, data) + # mode() should return 0, the first encountered mode + self.assertEqual(self.func(data), 0) def test_none_data(self): # Test that mode raises TypeError if given None as data. @@ -1809,8 +1810,8 @@ def test_counter_data(self): # Test that a Counter is treated like any other iterable. data = collections.Counter([1, 1, 1, 2]) # Since the keys of the counter are treated as data points, not the - # counts, this should raise. 
- self.assertRaises(statistics.StatisticsError, self.func, data) + # counts, this should return the first mode encountered, 1 + self.assertEqual(self.func(data), 1) class TestFMean(unittest.TestCase): From a6c65d2d0759f74bd1bef27de6bf36b4d3d8b2e1 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 11 Mar 2019 22:51:33 -0700 Subject: [PATCH 2/5] Unnecessary module lookup --- Lib/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 005d274a60849b..b418e55a4cc0b7 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -320,9 +320,9 @@ def count(x): nonlocal n n += 1 return x - total = math.fsum(map(count, data)) + total = fsum(map(count, data)) else: - total = math.fsum(data) + total = fsum(data) try: return total / n except ZeroDivisionError: From 4a1871588a7859d629c7dcf00addb7c7550eaac1 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 11 Mar 2019 23:52:23 -0700 Subject: [PATCH 3/5] Update documentation --- Doc/library/statistics.rst | 36 +++++++++++++++++++++++++++++------- Doc/whatsnew/3.8.rst | 8 ++++++++ Lib/statistics.py | 4 ++-- 3 files changed, 39 insertions(+), 9 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 81119da0a38236..4ee21331433be1 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -37,7 +37,7 @@ Averages and measures of central location These functions calculate an average or typical value from a population or sample. -======================= ============================================= +======================= ==================================================== :func:`mean` Arithmetic mean ("average") of data. :func:`fmean` Fast, floating point arithmetic mean. :func:`harmonic_mean` Harmonic mean of data. @@ -46,7 +46,8 @@ or sample. :func:`median_high` High median of data. :func:`median_grouped` Median, or 50th percentile, of grouped data. 
:func:`mode` Mode (most common value) of discrete data. -======================= ============================================= +:func:`multimode` List of modes (most common values) of discrete data. +======================= ==================================================== Measures of spread ------------------ @@ -287,12 +288,12 @@ However, for reading convenience, most of the examples show sorted sequences. .. function:: mode(data) - Return the most common data point from discrete or nominal *data*. The mode - (when it exists) is the most typical value, and is a robust measure of - central location. + Return the single most common data point from discrete or nominal *data*. + The mode (when it exists) is the most typical value, and is a robust + measure of central location. - If *data* is empty, or if there is not exactly one most common value, - :exc:`StatisticsError` is raised. + If there are multiple modes, returns the first one encountered in the *data*. + If *data* is empty, :exc:`StatisticsError` is raised. ``mode`` assumes discrete data, and returns a single value. This is the standard treatment of the mode as commonly taught in schools: @@ -310,6 +311,27 @@ However, for reading convenience, most of the examples show sorted sequences. >>> mode(["red", "blue", "blue", "red", "green", "red", "red"]) 'red' + .. versionchanged:: 3.8 + Now handles multimodal datasets by returning the first mode encountered. + Formerly, it raised :exc:`StatisticsError` when more than one mode was + found. + + +.. function:: multimode(data) + + Return a list of the most frequently occurring values in the order they + were first encountered in the *data*. Will return more than one result if + there are multiple modes or an empty list if *data* is empty: + + .. doctest:: + + >>> multimode('aabbbbccddddeeffffgg') + ['b', 'd', 'f'] + >>> multimode('') + [] + + .. versionadded:: 3.8 + .. 
function:: pstdev(data, mu=None) diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst index 9cd5a3a937dc9f..ad86917d0cc71a 100644 --- a/Doc/whatsnew/3.8.rst +++ b/Doc/whatsnew/3.8.rst @@ -282,6 +282,9 @@ Added :func:`statistics.fmean` as a faster, floating point variant of :func:`statistics.mean()`. (Contributed by Raymond Hettinger and Steven D'Aprano in :issue:`35904`.) +Added :func:`statistics.multimode` that returns a list of the most +common values. (Contributed by Raymond Hettinger in :issue:`35892`.) + Added :class:`statistics.NormalDist`, a tool for creating and manipulating normal distributions of a random variable. (Contributed by Raymond Hettinger in :issue:`36018`.) @@ -591,6 +594,11 @@ Changes in the Python API * The function :func:`platform.popen` has been removed, it was deprecated since Python 3.3: use :func:`os.popen` instead. +* The :func:`statistics.mode` function no longer raises an exception + when given multimodal data. Instead, it returns the first mode + encountered in the input data. (Contributed by Raymond Hettinger + in :issue:`35892`.) + * The :meth:`~tkinter.ttk.Treeview.selection` method of the :class:`tkinter.ttk.Treeview` class no longer takes arguments. Using it with arguments for changing the selection was deprecated in Python 3.6. Use diff --git a/Lib/statistics.py b/Lib/statistics.py index b418e55a4cc0b7..97f154373dc0b4 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -80,7 +80,7 @@ __all__ = [ 'StatisticsError', 'NormalDist', 'pstdev', 'pvariance', 'stdev', 'variance', 'median', 'median_low', 'median_high', 'median_grouped', - 'mean', 'mode', 'harmonic_mean', 'fmean', + 'mean', 'mode', 'multimode', 'harmonic_mean', 'fmean', ] import math @@ -527,7 +527,7 @@ def mode(data): def multimode(data): """ Return a list of the most frequently occurring values. - Will return more than one result is there are multiple modes + Will return more than one result if there are multiple modes or an empty list if *data* is empty. 
>>> multimode('aabbbbbbbbcc') From 337ab822673a87668e01e0a03a5ad8b32ebea7a4 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 11 Mar 2019 23:57:28 -0700 Subject: [PATCH 4/5] Add tests --- Lib/test/test_statistics.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 274fbf6cd66096..26b22a1c408024 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -1813,6 +1813,16 @@ def test_counter_data(self): # counts, this should return the first mode encountered, 1 self.assertEqual(self.func(data), 1) + +class TestMultiMode(unittest.TestCase): + + def test_basics(self): + multimode = statistics.multimode + self.assertEqual(multimode('aabbbbbbbbcc'), ['b']) + self.assertEqual(multimode('aabbbbccddddeeffffgg'), ['b', 'd', 'f']) + self.assertEqual(multimode(''), []) + + class TestFMean(unittest.TestCase): def test_basics(self): From fea270f16c7e122d4692b28ef987579bdab28970 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 12 Mar 2019 00:21:18 -0700 Subject: [PATCH 5/5] Minor doc refinements --- Doc/library/statistics.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 4ee21331433be1..97e1c3a0a1c2c6 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -37,7 +37,7 @@ Averages and measures of central location These functions calculate an average or typical value from a population or sample. -======================= ==================================================== +======================= =============================================================== :func:`mean` Arithmetic mean ("average") of data. :func:`fmean` Fast, floating point arithmetic mean. :func:`harmonic_mean` Harmonic mean of data. @@ -45,9 +45,9 @@ or sample. :func:`median_low` Low median of data. :func:`median_high` High median of data. 
:func:`median_grouped` Median, or 50th percentile, of grouped data. -:func:`mode` Mode (most common value) of discrete data. -:func:`multimode` List of modes (most common values) of discrete data. -======================= ==================================================== +:func:`mode` Single mode (most common value) of discrete or nominal data. +:func:`multimode` List of modes (most common values) of discrete or nominal data. +======================= =============================================================== Measures of spread ------------------ @@ -289,7 +289,7 @@ However, for reading convenience, most of the examples show sorted sequences. .. function:: mode(data) Return the single most common data point from discrete or nominal *data*. - The mode (when it exists) is the most typical value, and is a robust + The mode (when it exists) is the most typical value and serves as a measure of central location. If there are multiple modes, returns the first one encountered in the *data*. @@ -321,7 +321,7 @@ However, for reading convenience, most of the examples show sorted sequences. Return a list of the most frequently occurring values in the order they were first encountered in the *data*. Will return more than one result if - there are multiple modes or an empty list if *data* is empty: + there are multiple modes or an empty list if the *data* is empty: .. doctest::