From 8679a2e6c5f8cf18dad5055601a1a321990354f9 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Tue, 11 Mar 2014 13:57:48 -0400 Subject: [PATCH 1/3] ENH: Allow on kw to update. Work for 1 to many update. --- pandas/core/frame.py | 74 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 25 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fad348aed0..cda900518f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3003,10 +3003,9 @@ def combiner(x, y, needs_i8_conversion=False): return self.combine(other, combiner, overwrite=False) def update(self, other, join='left', overwrite=True, filter_func=None, - raise_conflict=False): + raise_conflict=False, on=None): """ - Modify DataFrame in place using non-NA values from passed - DataFrame. Aligns on indices + Modify DataFrame in place using non-NA values from passed DataFrame. Parameters ---------- @@ -3020,6 +3019,10 @@ def update(self, other, join='left', overwrite=True, filter_func=None, raise_conflict : boolean If True, will raise an error if the DataFrame and other both contain data in the same place. + on : label or list, optional + Column label(s) used to match up observations in other and + self. If None, other.reindex(index=self.index) is called, so the + index must match to get a meaningful result. 
""" # TODO: Support other joins if join != 'left': # pragma: no cover @@ -3028,32 +3031,53 @@ def update(self, other, join='left', overwrite=True, filter_func=None, if not isinstance(other, DataFrame): other = DataFrame(other) - other = other.reindex_like(self) - - for col in self.columns: - this = self[col].values - that = other[col].values - if filter_func is not None: - mask = -filter_func(this) | isnull(that) - else: - if raise_conflict: - mask_this = notnull(that) - mask_that = notnull(this) - if any(mask_this & mask_that): - raise ValueError("Data overlaps.") - - if overwrite: - mask = isnull(that) + if on is None: + other = other.reindex(index=self.index) + else: + try: + old_index = self.index + self.set_index(on, inplace=True) + other.set_index(on, inplace=True) + other = other.reindex(index=self.index) + except Exception, err: + self.reset_index(inplace=True) + self.set_index(old_index) + raise(err) - # don't overwrite columns unecessarily - if mask.all(): - continue + try: + for col in other.columns: + if col not in self: # don't update what doesn't exist + continue + this = self[col].values + that = other[col].values + if filter_func is not None: + mask = -filter_func(this) | isnull(that) else: - mask = notnull(this) + if raise_conflict: + mask_this = notnull(that) + mask_that = notnull(this) + if any(mask_this & mask_that): + raise ValueError("Data overlaps.") + + if overwrite: + mask = isnull(that) + + # don't overwrite columns unecessarily + if mask.all(): + continue + else: + mask = notnull(this) - self[col] = expressions.where( - mask, this, that, raise_on_error=True) + self[col] = expressions.where( + mask, this, that, raise_on_error=True) + except Exception, err: + raise(err) + + finally: + if on is not None: + self.reset_index(inplace=True) + self.set_index(old_index) #---------------------------------------------------------------------- # Misc methods From 1b3d90419a8a3deda4a8fb15765ab52305f1d41a Mon Sep 17 00:00:00 2001 From: Skipper Seabold 
Date: Tue, 11 Mar 2014 15:22:48 -0400 Subject: [PATCH 2/3] TST: Test on keyword for frame.update. --- pandas/tests/test_frame.py | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 3c39d610c1..0326a73e52 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -24,7 +24,7 @@ from numpy.random import randn import numpy as np import numpy.ma as ma -from numpy.testing import assert_array_equal +from numpy.testing import assert_array_equal, assert_ import numpy.ma.mrecords as mrecords import pandas.core.nanops as nanops @@ -9974,6 +9974,43 @@ def test_update(self): [1.5, nan, 7.]]) assert_frame_equal(df, expected) + def test_update_on(self): + df = DataFrame([[np.nan, 'A'], + [np.nan, 'A'], + [np.nan, 'A'], + [1.5, 'B'], + [2.2, 'C'], + [3.1, 'C'], + [1.2, 'B']], columns=['number', 'name']) + + df2 = DataFrame([[3.5, 'A']], columns=['number', 'name']) + + expected = DataFrame([[3.5, 'A'], + [3.5, 'A'], + [3.5, 'A'], + [1.5, 'B'], + [2.2, 'C'], + [3.1, 'C'], + [1.2, 'B']], columns=['number', 'name']) + df.update(df2, on='name') + assert_frame_equal(df, expected) + + df = DataFrame([[np.nan, 'A'], + [np.nan, 'A'], + [np.nan, 'A'], + [1.5, 'B'], + [2.2, 'C'], + [3.1, 'C'], + [1.2, 'B']], columns=['number', 'name']) + + df2 = DataFrame([[3.5, 'A'], [2.5, 'A']], + columns=['number', 'name']) + + assertRaises(ValueError, df.update, df2, on='name') + + ## and the index should be reset + assert_(df.index.equals(pd.Index(range(7)))) + def test_update_dtypes(self): # gh 3016 From d3e309969b643f9f1c7c74728fb67fa2a1e3bdf4 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Tue, 11 Mar 2014 15:32:11 -0400 Subject: [PATCH 3/3] ENH: Try to preserve column order. 
--- pandas/core/frame.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cda900518f..4811aff3a3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3036,6 +3036,7 @@ def update(self, other, join='left', overwrite=True, filter_func=None, else: try: old_index = self.index + col_order = self.columns self.set_index(on, inplace=True) other.set_index(on, inplace=True) other = other.reindex(index=self.index) @@ -3078,6 +3079,8 @@ def update(self, other, join='left', overwrite=True, filter_func=None, if on is not None: self.reset_index(inplace=True) self.set_index(old_index) + self = self[col_order] + #---------------------------------------------------------------------- # Misc methods