diff --git a/RELEASE.rst b/RELEASE.rst index c632506429..ad3c0ecc1d 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -94,9 +94,9 @@ Thanks ------ - Craig Austin - Andreas Hilboll -- Adam Klein - Matt Harrison - Arthur Gerigk +- Adam Klein - Gregg Lind - Solomon Negusse - Wouter Overmeire diff --git a/TODO.rst b/TODO.rst index 55546e3cb3..84a2ecd715 100644 --- a/TODO.rst +++ b/TODO.rst @@ -1,3 +1,11 @@ +LongPanel removal +================= + +- DONE level to flex methods +- DONE level to reindex +- ?? fast take for items + + DONE ---- - SparseSeries name integration + tests diff --git a/pandas/core/api.py b/pandas/core/api.py index 5c2423ab93..1201bde943 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -10,7 +10,7 @@ from pandas.core.daterange import DateRange from pandas.core.series import Series, TimeSeries from pandas.core.frame import DataFrame -from pandas.core.panel import Panel, LongPanel +from pandas.core.panel import Panel from pandas.core.groupby import groupby from pandas.core.reshape import pivot_simple as pivot diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8674a11357..fc8bff61f4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -36,6 +36,7 @@ import pandas.core.nanops as nanops import pandas.core.common as com +import pandas.core.generic as generic import pandas.core.datetools as datetools import pandas._tseries as lib @@ -100,11 +101,11 @@ def _add_stat_doc(f, name, shortname, na_action=_doc_exclude_na, f.__doc__ = doc def _arith_method(func, name, default_axis='columns'): - def f(self, other, axis=default_axis, fill_value=None): + def f(self, other, axis=default_axis, level=None, fill_value=None): if isinstance(other, DataFrame): # Another DataFrame - return self._combine_frame(other, func, fill_value) + return self._combine_frame(other, func, fill_value, level) elif isinstance(other, Series): - return self._combine_series(other, func, fill_value, axis) + return self._combine_series(other, func, fill_value, axis, 
level) else: return self._combine_const(other, func) @@ -375,6 +376,18 @@ def iteritems(self): """Iterator over (column, series) pairs""" return ((k, self[k]) for k in self.columns) + def iterrows(self): + """ + Iterate over rows of DataFrame as (index, Series) pairs + """ + from itertools import izip + columns = self.columns + for k, v in izip(self.index, self.values): + s = v.view(Series) + s.index = columns + s.name = k + yield k, s + iterkv = iteritems if py3compat.PY3: # pragma: no cover items = iteritems @@ -687,6 +700,70 @@ def to_sparse(self, fill_value=None, kind='block'): default_kind=kind, default_fill_value=fill_value) + def to_panel(self): + """ + Transform long (stacked) format (DataFrame) into wide (3D, Panel) + format. + + Currently the index of the DataFrame must be a 2-level MultiIndex. This + may be generalized later + + Returns + ------- + panel : Panel + """ + from pandas.core.panel import Panel + + wide_shape = (len(self.columns), len(self.index.levels[0]), + len(self.index.levels[1])) + + # only support this kind for now + assert(isinstance(self.index, MultiIndex) and + len(self.index.levels) == 2) + + major_axis, minor_axis = self.index.levels + + def make_mask(index): + """ + Create observation selection vector using major and minor + labels, for converting to wide format. 
+ """ + N, K = index.levshape + selector = index.labels[1] + K * index.labels[0] + mask = np.zeros(N * K, dtype=bool) + mask.put(selector, True) + return mask + + def _to_wide_homogeneous(): + values = np.empty(wide_shape, dtype=self.values.dtype) + if not issubclass(values.dtype.type, np.integer): + values.fill(np.nan) + + frame_values = self.values + for i in xrange(len(self.columns)): + values[i].flat[mask] = frame_values[:, i] + return Panel(values, self.columns, major_axis, minor_axis) + + def _to_wide_mixed(): + _, N, K = wide_shape + # TODO: make much more efficient + data = {} + for item in self.columns: + item_vals = self[item].values + values = np.empty((N, K), dtype=item_vals.dtype) + values.flat[mask] = item_vals + data[item] = DataFrame(values, index=major_axis, + columns=minor_axis) + return Panel(data, self.columns, major_axis, minor_axis) + + mask = make_mask(self.index) + if self._is_mixed_type: + return _to_wide_mixed() + else: + return _to_wide_homogeneous() + + to_wide = deprecate('to_wide', to_panel) + def to_csv(self, path, sep=",", na_rep='', cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None): """ @@ -1258,7 +1335,7 @@ def xs(self, key, axis=0, copy=True): #---------------------------------------------------------------------- # Reindexing and alignment - def align(self, other, join='outer', axis=None, copy=True): + def align(self, other, join='outer', axis=None, level=None, copy=True): """ Align two DataFrame object on their index and columns with the specified join method for each axis Index @@ -1276,13 +1353,16 @@ def align(self, other, join='outer', axis=None, copy=True): Aligned Series """ if isinstance(other, DataFrame): - return self._align_frame(other, join=join, axis=axis, copy=copy) + return self._align_frame(other, join=join, axis=axis, level=level, + copy=copy) elif isinstance(other, Series): - return self._align_series(other, join=join, axis=axis, copy=copy) + return self._align_series(other, join=join, 
axis=axis, level=level, + copy=copy) else: # pragma: no cover raise TypeError('unsupported type: %s' % type(other)) - def _align_frame(self, other, join='outer', axis=None, copy=True): + def _align_frame(self, other, join='outer', axis=None, level=None, + copy=True): # defaults join_index = self.index join_columns = self.columns @@ -1291,14 +1371,15 @@ def _align_frame(self, other, join='outer', axis=None, copy=True): if axis is None or axis == 0: if not self.index.equals(other.index): - join_index, ilidx, iridx = self.index.join(other.index, how=join, - return_indexers=True) + join_index, ilidx, iridx = \ + self.index.join(other.index, how=join, level=level, + return_indexers=True) if axis is None or axis == 1: if not self.columns.equals(other.columns): - join_columns, clidx, cridx = self.columns.join(other.columns, - how=join, - return_indexers=True) + join_columns, clidx, cridx = \ + self.columns.join(other.columns, how=join, level=level, + return_indexers=True) def _align(frame, row_idx, col_idx): new_data = frame._data @@ -1307,7 +1388,8 @@ def _align(frame, row_idx, col_idx): if col_idx is not None: # TODO: speed up on homogeneous DataFrame objects - new_data = new_data.reindex_items(join_columns) + new_data = new_data.reindex_indexer(join_columns, col_idx, + axis=0) if copy and new_data is frame._data: new_data = new_data.copy() @@ -1318,7 +1400,17 @@ def _align(frame, row_idx, col_idx): right = _align(other, iridx, cridx) return left, right - def _align_series(self, other, join='outer', axis=None, copy=True): + def _align_level(self, multi_index, level, axis=0, copy=True): + levnum = multi_index._get_level_number(level) + data = self.reindex_axis(multi_index.levels[levnum], axis=axis, + copy=False)._data + mgr_axis = 0 if axis == 1 else 1 + new_data = data.reindex_indexer(multi_index, multi_index.labels[levnum], + axis=mgr_axis) + return DataFrame(new_data) + + def _align_series(self, other, join='outer', axis=None, level=None, + copy=True): fdata = 
self._data if axis == 0: join_index = self.index @@ -1333,11 +1425,12 @@ def _align_series(self, other, join='outer', axis=None, copy=True): join_index = self.columns lidx, ridx = None, None if not self.columns.equals(other.index): - join_index, lidx, ridx = self.columns.join(other.index, how=join, - return_indexers=True) + join_index, lidx, ridx = \ + self.columns.join(other.index, how=join, + return_indexers=True) if lidx is not None: - fdata = fdata.reindex_items(join_index) + fdata = fdata.reindex_indexer(join_index, lidx, axis=0) else: raise ValueError('Must specify axis=0 or 1') @@ -1348,8 +1441,9 @@ def _align_series(self, other, join='outer', axis=None, copy=True): right_result = other if ridx is None else other.reindex(join_index) return left_result, right_result - def reindex(self, index=None, columns=None, method=None, copy=True): - """Conform Series to new index with optional filling logic, placing + def reindex(self, index=None, columns=None, method=None, level=None, + copy=True): + """Conform DataFrame to new index with optional filling logic, placing NA/NaN in locations having no value in the previous index. 
A new object is produced unless the new index is equivalent to the current one and copy=False @@ -1380,33 +1474,65 @@ def reindex(self, index=None, columns=None, method=None, copy=True): frame = self if index is not None: - index = _ensure_index(index) - frame = frame._reindex_index(index, method, copy) + frame = frame._reindex_index(index, method, copy, level) if columns is not None: - columns = _ensure_index(columns) - frame = frame._reindex_columns(columns, copy) + frame = frame._reindex_columns(columns, copy, level) return frame - def _reindex_index(self, new_index, method, copy): - if new_index.equals(self.index): - if copy: - result = self.copy() - result.index = new_index - return result - else: - return self - new_data = self._data.reindex_axis(new_index, method, axis=1) + def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True): + """Conform DataFrame to new index with optional filling logic, placing + NA/NaN in locations having no value in the previous index. A new object + is produced unless the new index is equivalent to the current one and + copy=False + + Parameters + ---------- + index : array-like, optional + New labels / index to conform to. 
Preferably an Index object to + avoid duplicating data + axis : {0, 1} + 0 -> index (rows) + 1 -> columns + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed DataFrame + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + copy : boolean, default True + Return a new object, even if the passed indexes are the same + + Examples + -------- + >>> df.reindex(['A', 'B', 'C'], axis=1) + + See also + -------- + DataFrame.reindex, DataFrame.reindex_like + + Returns + ------- + reindexed : same type as calling instance + """ + self._consolidate_inplace() + if axis == 0: + return self._reindex_index(labels, method, copy, level) + elif axis == 1: + return self._reindex_columns(labels, copy, level) + else: # pragma: no cover + raise ValueError('Must specify axis=0 or 1') + + def _reindex_index(self, new_index, method, copy, level): + if level is not None: + return self._align_level(new_index, level, axis=0, copy=copy) + new_data = self._data.reindex_axis(new_index, method, axis=1, + copy=copy) return self._constructor(new_data) - def _reindex_columns(self, new_columns, copy): - if new_columns.equals(self.columns): - if copy: - return self.copy() - else: - return self - new_data = self._data.reindex_axis(new_columns, axis=0) + def _reindex_columns(self, new_columns, copy, level): + if level is not None: + return self._align_level(new_columns, level, axis=1, copy=copy) + new_data = self._data.reindex_axis(new_columns, axis=0, copy=copy) return self._constructor(new_data) def reindex_like(self, other, method=None, copy=True): @@ -1432,6 +1558,8 @@ def reindex_like(self, other, method=None, copy=True): return self.reindex(index=other.index, columns=other.columns, method=method, copy=copy) + truncate = generic.truncate + def set_index(self, col_or_cols, drop=True, inplace=False): """ Set the DataFrame index (row labels) using one or more 
existing @@ -1890,8 +2018,8 @@ def _rename_columns_inplace(self, mapper): #---------------------------------------------------------------------- # Arithmetic / combination related - def _combine_frame(self, other, func, fill_value=None): - this, other = self.align(other, join='outer', copy=False) + def _combine_frame(self, other, func, fill_value=None, level=None): + this, other = self.align(other, join='outer', level=level, copy=False) new_index, new_columns = this.index, this.columns this_vals = this.values @@ -1917,7 +2045,8 @@ def _indexed_same(self, other): same_columns = self.columns.equals(other.columns) return same_index and same_columns - def _combine_series(self, other, func, fill_value=None, axis=None): + def _combine_series(self, other, func, fill_value=None, axis=None, + level=None): if axis is not None: axis = self._get_axis_name(axis) if axis == 'index': @@ -2458,31 +2587,12 @@ def _apply_broadcast(self, func, axis): return result - def _apply_level(self, f, axis='major', broadcast=False): - from pandas.core.panel import LongPanel - - if axis == 'major': - panel = self.swapaxes() - result = panel._apply_level(f, axis='minor', broadcast=broadcast) - if broadcast: - result = result.swapaxes() - - return result - - bounds = self.index._bounds - values = self.values - N, _ = values.shape - result = group_agg(values, bounds, f) - + def _apply_level(self, f, level=0, broadcast=False): + grouped = self.groupby(level=level) if broadcast: - repeater = np.concatenate((np.diff(bounds), [N - bounds[-1]])) - panel = LongPanel(result.repeat(repeater, axis=0), - columns=self.items, index=self.index) + return grouped.transform(f) else: - panel = DataFrame(result, index=self.major_axis, - columns=self.items) - - return panel + return grouped.agg(f) def applymap(self, func): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bbf9fde762..3fd71b1004 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3,7 +3,7 @@ import numpy as np 
from pandas.core.common import save, load -from pandas.core.index import _ensure_index +from pandas.core.index import MultiIndex import pandas.core.datetools as datetools #------------------------------------------------------------------------------- @@ -100,26 +100,6 @@ def groupby(self, by=None, axis=0, level=None, as_index=True): from pandas.core.groupby import groupby return groupby(self, by, axis=axis, level=level, as_index=as_index) - def truncate(self, before=None, after=None): - """Function truncate a sorted DataFrame / Series before and/or after - some particular dates. - - Parameters - ---------- - before : date - Truncate before date - after : date - Truncate after date - - Returns - ------- - truncated : type of caller - """ - before = datetools.to_datetime(before) - after = datetools.to_datetime(after) - # returns view, want to copy - return self.ix[before:after].copy() - def select(self, crit, axis=0): """ Return data corresponding to axis labels matching criteria @@ -352,17 +332,13 @@ def _is_mixed_type(self): return len(self._data.blocks) > 1 def _reindex_axis(self, new_index, fill_method, axis, copy): - new_index = _ensure_index(new_index) - cur_axis = self._data.axes[axis] - if cur_axis.equals(new_index) and not copy: - return self + new_data = self._data.reindex_axis(new_index, axis=axis, + method=fill_method, copy=copy) - if axis == 0: - new_data = self._data.reindex_items(new_index) + if new_data is self._data and not copy: + return self else: - new_data = self._data.reindex_axis(new_index, axis=axis, - method=fill_method) - return self._constructor(new_data) + return self._constructor(new_data) def cumsum(self, axis=None, skipna=True): """ @@ -568,8 +544,42 @@ def take(self, indices, axis=0): if axis == 0: labels = self._get_axis(axis) new_items = labels.take(indices) - new_data = self._data.reindex_items(new_items) + new_data = self._data.reindex_axis(new_items, axis=0) else: new_data = self._data.take(indices, axis=axis) return 
self._constructor(new_data) +# Good for either Series or DataFrame + +def truncate(self, before=None, after=None, copy=True): + """Function truncate a sorted DataFrame / Series before and/or after + some particular dates. + + Parameters + ---------- + before : date + Truncate before date + after : date + Truncate after date + + Returns + ------- + truncated : type of caller + """ + before = datetools.to_datetime(before) + after = datetools.to_datetime(after) + + if before is not None and after is not None: + assert(before <= after) + + left, right = self.index.slice_locs(before, after) + result = self[left:right] + + if isinstance(self.index, MultiIndex): + result.index = self.index.truncate(before, after) + + if copy: + result = result.copy() + + return result + diff --git a/pandas/core/index.py b/pandas/core/index.py index 71a0d30911..dc2672d2d4 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -559,11 +559,32 @@ def reindex(self, target, method=None): indexer = self.get_indexer(target, method=method) return target, indexer - def join(self, other, how='left', return_indexers=False): + def join(self, other, how='left', level=None, return_indexers=False): + """ + Internal API method. Compute join_index and indexers to conform data + structures to the new index. 
+ + Parameters + ---------- + other : Index + how : {'left', 'right', 'inner', 'outer'} + level : + return_indexers : boolean, default False + + Returns + ------- + join_index, (left_indexer, right_indexer) + """ + if (level is not None and (isinstance(self, MultiIndex) or + isinstance(other, MultiIndex))): + return self._join_level(other, level, how=how, + return_indexers=return_indexers) + if self.dtype != other.dtype: this = self.astype('O') other = other.astype('O') - return this.join(other, how=how, return_indexers=return_indexers) + return this.join(other, how=how, + return_indexers=return_indexers) if self.is_monotonic and other.is_monotonic: return self._join_monotonic(other, how=how, @@ -593,6 +614,61 @@ def join(self, other, how='left', return_indexers=False): else: return join_index + def _join_level(self, other, level, how='left', return_indexers=False): + """ + The join method *only* affects the level of the resulting + MultiIndex. Otherwise it just exactly aligns the Index data to the + labels of the level in the MultiIndex. 
The order of the data indexed by + the MultiIndex will not be changed (currently) + """ + + if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): + raise Exception('Join on level between two MultiIndex objects ' + 'is ambiguous') + + left, right = self, other + + flip_order = not isinstance(self, MultiIndex) + if flip_order: + left, right = right, left + + level = left._get_level_number(level) + + old_level = left.levels[level] + + new_level, left_lev_indexer, right_lev_indexer = \ + old_level.join(right, how=how, return_indexers=True) + + if left_lev_indexer is not None: + rev_indexer = lib.get_reverse_indexer(left_lev_indexer, + len(old_level)) + + new_labels = list(left.labels) + new_labels[level] = rev_indexer.take(left.labels[level]) + + new_levels = list(left.levels) + new_levels[level] = new_level + + join_index = MultiIndex(levels=new_levels, labels=new_labels, + names=left.names) + else: + join_index = left + + left_indexer = None + + if right_lev_indexer is not None: + right_indexer = right_lev_indexer.take(join_index.labels[level]) + else: + right_indexer = join_index.labels[level] + + if flip_order: + left_indexer, right_indexer = right_indexer, left_indexer + + if return_indexers: + return join_index, left_indexer, right_indexer + else: + return join_index + def _join_monotonic(self, other, how='left', return_indexers=False): if how == 'left': join_index = self @@ -905,6 +981,22 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None): return subarr + def copy(self, order='C'): + """ + Overridden ndarray.copy to copy over attributes + + Returns + ------- + cp : Index + Returns view on same base ndarray + """ + cp = self.view(np.ndarray).view(type(self)) + cp.levels = list(self.levels) + cp.labels = list(self.labels) + cp.names = list(self.names) + cp.sortorder = self.sortorder + return cp + @property def dtype(self): return np.dtype('O') @@ -941,6 +1033,23 @@ def _has_complex_internals(self): # to disable groupby tricks 
return True + @property + def has_duplicates(self): + """ + Return True if there are no unique groups + """ + # has duplicates + shape = [len(lev) for lev in self.levels] + group_index = np.zeros(len(self), dtype='i8') + for i in xrange(len(shape)): + stride = np.prod([x for x in shape[i+1:]], dtype='i8') + group_index += self.labels[i] * stride + + if len(np.unique(group_index)) < len(group_index): + return True + + return False + def get_level_values(self, level): """ Return vector of label values for requested level, equal to the length @@ -1103,7 +1212,7 @@ def __getitem__(self, key): # an optimization result = new_tuples.view(MultiIndex) - result.levels = self.levels + result.levels = list(self.levels) result.labels = new_labels result.sortorder = sortorder result.names = self.names diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ea90e63534..29809060f5 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -292,6 +292,10 @@ class _SeriesIndexer(_NDFrameIndexer): def __getitem__(self, key): ax = self.obj.index + + if isinstance(key, slice): + key = self._convert_slice(key) + if isinstance(ax, MultiIndex): try: # key = ax.get_loc(key) @@ -302,7 +306,7 @@ def __getitem__(self, key): if _isboolarr(key): self._check_boolean_key(key) elif isinstance(key, slice): - key = self._convert_slice(key) + pass elif _is_list_like(key): return self._get_list_like(key) return self._get_default(key) @@ -398,10 +402,12 @@ def _is_list_like(obj): def _is_label_slice(labels, obj): def crit(x): - if x in labels: + try: + _ = labels.get_loc(x) return False - else: + except KeyError: return isinstance(x, int) or x is None + return not crit(obj.start) or not crit(obj.stop) def _need_slice(obj): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index aafbfe550c..f7ee6431d0 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1,3 +1,4 @@ +from collections import defaultdict import itertools from numpy import nan @@ 
-442,7 +443,7 @@ def copy(self, deep=True): """ copy_blocks = [block.copy(deep=deep) for block in self.blocks] # copy_axes = [ax.copy() for ax in self.axes] - copy_axes = self.axes + copy_axes = list(self.axes) return BlockManager(copy_blocks, copy_axes) def as_matrix(self, items=None): @@ -643,15 +644,32 @@ def _check_have(self, item): if item not in self.items: raise KeyError('no item named %s' % str(item)) - def reindex_axis(self, new_axis, method=None, axis=0): + def reindex_axis(self, new_axis, method=None, axis=0, copy=True): + new_axis = _ensure_index(new_axis) + cur_axis = self.axes[axis] + + if new_axis.equals(cur_axis): + if copy: + result = self.copy(deep=True) + result.axes[axis] = new_axis + return result + else: + return self + if axis == 0: assert(method is None) return self.reindex_items(new_axis) - new_axis = _ensure_index(new_axis) - cur_axis = self.axes[axis] - new_axis, indexer = cur_axis.reindex(new_axis, method) + return self.reindex_indexer(new_axis, indexer, axis=axis) + + def reindex_indexer(self, new_axis, indexer, axis=1): + """ + pandas-indexer with -1's only + """ + if axis == 0: + return self._reindex_indexer_items(new_axis, indexer) + mask = indexer == -1 # TODO: deal with length-0 case? or does it fall out? 
@@ -667,23 +685,34 @@ def reindex_axis(self, new_axis, method=None, axis=0): new_axes[axis] = new_axis return BlockManager(new_blocks, new_axes) - def reindex_indexer(self, new_axis, indexer, axis=1): - """ - pandas-indexer with -1's only - """ - if axis == 0: - raise NotImplementedError + def _reindex_indexer_items(self, new_items, indexer): + # TODO: less efficient than I'd like + + item_order = com.take_1d(self.items.values, indexer) + + # keep track of what items aren't found anywhere + mask = np.zeros(len(item_order), dtype=bool) - new_axes = list(self.axes) - new_axes[axis] = new_axis new_blocks = [] for blk in self.blocks: - new_values = com.take_fast(blk.values, indexer, None, - False, axis=axis) - newb = make_block(new_values, blk.items, self.items) - new_blocks.append(newb) + blk_indexer = blk.items.get_indexer(item_order) + selector = blk_indexer != -1 + # update with observed items + mask |= selector + + new_block_items = new_items.take(selector.nonzero()[0]) + new_values = com.take_fast(blk.values, blk_indexer[selector], + None, False, axis=0) + new_blocks.append(make_block(new_values, new_block_items, + new_items)) + + if not mask.all(): + na_items = new_items[-mask] + na_block = self._make_na_block(na_items, new_items) + new_blocks.append(na_block) + new_blocks = _consolidate(new_blocks, new_items) - return BlockManager(new_blocks, new_axes) + return BlockManager(new_blocks, [new_items] + self.axes[1:]) def reindex_items(self, new_items): """ @@ -707,20 +736,20 @@ def reindex_items(self, new_items): if mask.any(): extra_items = new_items[mask] - - block_shape = list(self.shape) - block_shape[0] = len(extra_items) - block_values = np.empty(block_shape, dtype=np.float64) - block_values.fill(nan) - na_block = make_block(block_values, extra_items, new_items, - do_integrity_check=True) + na_block = self._make_na_block(extra_items, new_items) new_blocks.append(na_block) new_blocks = _consolidate(new_blocks, new_items) - new_axes = list(self.axes) - 
new_axes[0] = new_items + return BlockManager(new_blocks, [new_items] + self.axes[1:]) - return BlockManager(new_blocks, new_axes) + def _make_na_block(self, items, ref_items): + block_shape = list(self.shape) + block_shape[0] = len(items) + block_values = np.empty(block_shape, dtype=np.float64) + block_values.fill(nan) + na_block = make_block(block_values, items, ref_items, + do_integrity_check=True) + return na_block def take(self, indexer, axis=1): if axis == 0: @@ -871,6 +900,17 @@ def block_id_vector(self): assert((result >= 0).all()) return result + @property + def item_dtypes(self): + result = np.empty(len(self.items), dtype='O') + mask = np.zeros(len(self.items), dtype=bool) + for i, blk in enumerate(self.blocks): + indexer = self.items.get_indexer(blk.items) + result.put(indexer, blk.values.dtype.name) + mask.put(indexer, 1) + assert(mask.all()) + return result + def form_blocks(data, axes): # pre-filter out items if we passed it items = axes[0] @@ -1242,3 +1282,16 @@ def _upcast_blocks(blocks): # use any ref_items return _consolidate(new_blocks, newb.ref_items) +def _make_block_indexers(blocks, indexer, block_ids, block_locs, block_dtypes, + ref_items): + counts = defaultdict(int) + for dtype_name in block_dtypes.take(indexer): + counts[dtype_name] += 1 + + findexer = np.empty(counts['float64'], dtype='i4') + bindexer = np.empty(counts['bool'], dtype='i4') + oindexer = np.empty(counts['object'], dtype='i4') + iindexer = np.empty(counts['int64'], dtype='i4') + + for idx in indexer: + pass diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 8d67c25836..0c132a8b91 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -13,8 +13,8 @@ from pandas.core.internals import BlockManager, make_block, form_blocks from pandas.core.frame import DataFrame, _union_indexes from pandas.core.generic import NDFrame -from pandas.core.series import Series from pandas.util import py3compat +from pandas.util.decorators import deprecate import 
pandas.core.common as com import pandas._tseries as lib @@ -90,7 +90,7 @@ def f(self, other): 'done with scalar values') return self._combine(other, func) - + f.__name__ = name return f def _panel_arith_method(op, name): @@ -106,7 +106,7 @@ def f(self, other, axis='items'): Returns ------- - LongPanel + Panel """ return self._combine(other, op, axis=axis) @@ -514,14 +514,6 @@ def _slice(self, slobj, axis=0): def __setitem__(self, key, value): _, N, K = self.shape - - # XXX - if isinstance(value, LongPanel): - if len(value.items) != 1: - raise ValueError('Input panel must have only one item!') - - value = value.to_wide()[value.items[0]] - if isinstance(value, DataFrame): value = value.reindex(index=self.major_axis, columns=self.minor_axis) @@ -659,13 +651,12 @@ def reindex_like(self, other, method=None): minor=other.minor_axis, method=method) def _combine(self, other, func, axis=0): - if isinstance(other, (Panel, LongPanel)): + if isinstance(other, Panel): return self._combine_panel(other, func) elif isinstance(other, DataFrame): return self._combine_frame(other, func, axis=axis) elif np.isscalar(other): new_values = func(self.values, other) - return Panel(new_values, self.items, self.major_axis, self.minor_axis) @@ -691,15 +682,11 @@ def _combine_frame(self, other, func, axis=0): self.minor_axis) def _combine_panel(self, other, func): - if isinstance(other, LongPanel): - other = other.to_wide() - items = self.items + other.items major = self.major_axis + other.major_axis minor = self.minor_axis + other.minor_axis # could check that everything's the same size, but forget it - this = self.reindex(items=items, major=major, minor=minor) other = other.reindex(items=items, major=major, minor=minor) @@ -853,9 +840,9 @@ def swapaxes(self, axis1='major', axis2='minor'): return Panel(new_values, *new_axes) - def to_long(self, filter_observations=True): + def to_frame(self, filter_observations=True): """ - Transform wide format into long (stacked) format + Transform wide 
format into long (stacked) format as DataFrame Parameters ---------- @@ -865,7 +852,7 @@ def to_long(self, filter_observations=True): Returns ------- - y : LongPanel + y : DataFrame """ I, N, K = self.shape @@ -889,11 +876,13 @@ def to_long(self, filter_observations=True): minor_labels = minor_labels.ravel()[selector] index = MultiIndex(levels=[self.major_axis, self.minor_axis], - labels=[major_labels, minor_labels]) + labels=[major_labels, minor_labels], + names=['major', 'minor']) - return LongPanel(data, index=index, columns=self.items) + return DataFrame(data, index=index, columns=self.items) - toLong = to_long + to_long = deprecate('to_long', to_frame) + toLong = deprecate('toLong', to_frame) def filter(self, items): """ @@ -1170,373 +1159,65 @@ def _get_join_index(self, other, how): return join_major, join_minor WidePanel = Panel +LongPanel = DataFrame -#------------------------------------------------------------------------------- -# LongPanel and friends - - -class LongPanel(DataFrame): +def make_dummies(frame, item): """ - Represents long or "stacked" format panel data + Use unique values in column of panel to construct DataFrame containing + dummy variables in the columns (constructed from the unique values) Parameters ---------- - values : ndarray (N x K) - items : sequence - index : MultiIndex - - Note - ---- - LongPanel will likely disappear in a future release in favor of just using - DataFrame objects with hierarchical indexes. 
You should be careful about - writing production code depending on LongPanel - """ - - @property - def consistent(self): - offset = max(len(self.major_axis), len(self.minor_axis)) - - major_labels = self.major_labels - minor_labels = self.minor_labels - - # overflow risk - if (offset + 1) ** 2 > 2**32: # pragma: no cover - major_labels = major_labels.astype(np.int64) - minor_labels = minor_labels.astype(np.int64) - - keys = major_labels * offset + minor_labels - unique_keys = np.unique(keys) - - if len(unique_keys) < len(keys): - return False - - return True - - @property - def wide_shape(self): - return (len(self.items), len(self.major_axis), len(self.minor_axis)) - - @property - def items(self): - return self.columns - - @property - def _constructor(self): - return LongPanel - - def __len__(self): - return len(self.index) - - def __repr__(self): - return DataFrame.__repr__(self) - - @classmethod - def fromRecords(cls, data, major_field, minor_field, - exclude=None): - """ - Create LongPanel from DataFrame or record / structured ndarray - object - - Parameters - ---------- - data : DataFrame, structured or record array, or dict - major_field : string - minor_field : string - Name of field - exclude : list-like, default None - - Returns - ------- - LongPanel - """ - return cls.from_records(data, [major_field, minor_field], - exclude=exclude) - - def toRecords(self): - major = np.asarray(self.major_axis).take(self.major_labels) - minor = np.asarray(self.minor_axis).take(self.minor_labels) - - arrays = [major, minor] + list(self.values[:, i] - for i in range(len(self.items))) - - names = ['major', 'minor'] + list(self.items) - - return np.rec.fromarrays(arrays, names=names) - - @property - def major_axis(self): - return self.index.levels[0] - - @property - def minor_axis(self): - return self.index.levels[1] - - @property - def major_labels(self): - return self.index.labels[0] - - @property - def minor_labels(self): - return self.index.labels[1] - - def _combine(self, 
other, func, axis='items'): - if isinstance(other, LongPanel): - return self._combine_frame(other, func) - elif isinstance(other, DataFrame): - return self._combine_panel_frame(other, func, axis=axis) - elif isinstance(other, Series): - return self._combine_series(other, func, axis=axis) - elif np.isscalar(other): - return LongPanel(func(self.values, other), columns=self.items, - index=self.index) - else: # pragma: no cover - raise Exception('type %s not supported' % type(other)) - - def _combine_panel_frame(self, other, func, axis='items'): - """ - Arithmetic op - - Parameters - ---------- - other : DataFrame - func : function - axis : int / string - - Returns - ------- - y : LongPanel - """ - wide = self.to_wide() - result = wide._combine_frame(other, func, axis=axis) - return result.to_long() - - add = _panel_arith_method(operator.add, 'add') - subtract = sub = _panel_arith_method(operator.sub, 'subtract') - multiply = mul = _panel_arith_method(operator.mul, 'multiply') - - try: - divide = div = _panel_arith_method(operator.div, 'divide') - except AttributeError: # pragma: no cover - # Python 3 - divide = div = _panel_arith_method(operator.truediv, 'divide') - - def to_wide(self): - """ - Transform long (stacked) format into wide format - - Returns - ------- - Panel - """ - assert(self.consistent) - mask = make_mask(self.index) - if self._data.is_mixed_dtype(): - return self._to_wide_mixed(mask) - else: - return self._to_wide_homogeneous(mask) - - def _to_wide_homogeneous(self, mask): - values = np.empty(self.wide_shape, dtype=self.values.dtype) - - if not issubclass(self.values.dtype.type, np.integer): - values.fill(np.nan) - - for i in xrange(len(self.items)): - values[i].flat[mask] = self.values[:, i] - - return Panel(values, self.items, self.major_axis, self.minor_axis) - - def _to_wide_mixed(self, mask): - _, N, K = self.wide_shape - - # TODO: make much more efficient - - data = {} - for i, item in enumerate(self.items): - item_vals = self[item].values - - 
values = np.empty((N, K), dtype=item_vals.dtype) - values.ravel()[mask] = item_vals - data[item] = DataFrame(values, index=self.major_axis, - columns=self.minor_axis) - return Panel.from_dict(data) - - def toCSV(self, path): - def format_cols(items): - cols = ['Major', 'Minor'] + list(items) - return '"%s"' % '","'.join(cols) - - def format_row(major, minor, values): - vals = ','.join('%.12f' % val for val in values) - return '%s,%s,%s' % (major, minor, vals) - - f = open(path, 'w') - self._textConvert(f, format_cols, format_row) - f.close() - - def _textConvert(self, buf, format_cols, format_row): - print >> buf, format_cols(self.items) - - label_pairs = zip(self.major_axis.take(self.major_labels), - self.minor_axis.take(self.minor_labels)) - for i, (major, minor) in enumerate(label_pairs): - row = format_row(major, minor, self.values[i]) - print >> buf, row - - def swapaxes(self): - """ - Swap major and minor axes and reorder values to be grouped by - minor axis values - - Returns - ------- - LongPanel (new object) - """ - # Order everything by minor labels. Have to use mergesort - # because NumPy quicksort is not stable. Here of course I'm - # using the property that the major labels are ordered. 
- indexer = self.minor_labels.argsort(kind='mergesort') - - new_major = self.minor_labels.take(indexer) - new_minor = self.major_labels.take(indexer) - new_values = self.values.take(indexer, axis=0) - - new_index = MultiIndex(levels=[self.minor_axis, self.major_axis], - labels=[new_major, new_minor]) - - return LongPanel(new_values, columns=self.items, - index=new_index) - - def truncate(self, before=None, after=None): - """ - Slice panel between two major axis values, return complete LongPanel - - Parameters - ---------- - before : type of major_axis values or None, default None - None defaults to start of panel - - after : type of major_axis values or None, default None - None defaults to end of panel - - Returns - ------- - LongPanel - """ - left, right = self.index.slice_locs(before, after) - new_index = self.index.truncate(before, after) - - return LongPanel(self.values[left : right], - columns=self.items, index=new_index) - - def get_axis_dummies(self, axis='minor', transform=None, - prefix=None): - """ - Construct 1-0 dummy variables corresponding to designated axis - labels - - Parameters - ---------- - axis : {'major', 'minor'}, default 'minor' - transform : function, default None - - Function to apply to axis labels first. 
For example, to - get "day of week" dummies in a time series regression you might - call: - - panel.get_axis_dummies(axis='major', - transform=lambda d: d.weekday()) - Returns - ------- - LongPanel, item names taken from chosen axis - """ - if axis == 'minor': - dim = len(self.minor_axis) - items = self.minor_axis - labels = self.minor_labels - elif axis == 'major': - dim = len(self.major_axis) - items = self.major_axis - labels = self.major_labels - else: # pragma: no cover - raise ValueError('Do not recognize axis %s' % axis) - - if transform: - mapped = np.array([transform(val) for val in items]) - - items = np.array(sorted(set(mapped))) - labels = Index(items).get_indexer(mapped[labels]) - dim = len(items) + item : object + Value in panel items Index - values = np.eye(dim, dtype=float) - values = values.take(labels, axis=0) - - result = LongPanel(values, columns=items, index=self.index) - - if prefix is None: - prefix = '' - - result = result.add_prefix(prefix) - - return result - - def get_dummies(self, item): - """ - Use unique values in column of panel to construct LongPanel - containing dummy - - Parameters - ---------- - item : object - Value in panel items Index - - Returns - ------- - LongPanel - """ - idx = self.items.indexMap[item] - values = self.values[:, idx] - - distinct_values = np.array(sorted(set(values))) - mapping = distinct_values.searchsorted(values) - - values = np.eye(len(distinct_values)) - - dummy_mat = values.take(mapping, axis=0) - - return LongPanel(dummy_mat, columns=distinct_values, - index=self.index) - - def mean(self, axis='major', broadcast=False): - return self.apply(lambda x: np.mean(x, axis=0), axis, broadcast) + Returns + ------- + dummies : DataFrame + """ + from pandas import Factor + factor = Factor(frame[item].values) + values = np.eye(len(factor.levels)) + dummy_mat = values.take(factor.labels, axis=0) + return DataFrame(dummy_mat, columns=factor.levels, index=frame.index) - def sum(self, axis='major', broadcast=False): 
- return self.apply(lambda x: np.sum(x, axis=0), axis, broadcast) +def make_axis_dummies(frame, axis='minor', transform=None): + """ + Construct 1-0 dummy variables corresponding to designated axis + labels - def apply(self, f, axis='major', broadcast=False): - """ - Aggregate over a particular axis + Parameters + ---------- + axis : {'major', 'minor'}, default 'minor' + transform : function, default None + Function to apply to axis labels first. For example, to + get "day of week" dummies in a time series regression you might + call: + make_axis_dummies(panel, axis='major', + transform=lambda d: d.weekday()) + Returns + ------- + dummies : DataFrame + Column names taken from chosen axis + """ + numbers = { + 'major' : 0, + 'minor' : 1 + } + num = numbers.get(axis, axis) - Parameters - ---------- - f : function - NumPy function to apply to each group - axis : {'major', 'minor'} + items = frame.index.levels[num] + labels = frame.index.labels[num] + if transform is not None: + mapped_items = items.map(transform) + factor = Factor(mapped_items.take(labels)) + labels = factor.labels + items = factor.levels - broadcast : boolean + values = np.eye(len(items), dtype=float) + values = values.take(labels, axis=0) - Returns - ------- - broadcast=True -> LongPanel - broadcast=False -> DataFrame - """ - try: - return self._apply_level(f, axis=axis, broadcast=broadcast) - except Exception: - # ufunc - new_values = f(self.values) - return LongPanel(new_values, columns=self.items, - index=self.index) + return DataFrame(values, columns=items, index=frame.index) def _prep_ndarray(values, copy=True): if not isinstance(values, np.ndarray): @@ -1619,16 +1300,5 @@ def _get_distinct_indexes(indexes): indexes = sorted(indexes, key=id) return [gp.next() for _, gp in groupby(indexes, id)] -def make_mask(index): - """ - Create observation selection vector using major and minor - labels, for converting to wide format. 
- """ - N, K = index.levshape - selector = index.labels[1] + K * index.labels[0] - mask = np.zeros(N * K, dtype=bool) - mask.put(selector, True) - return mask - def _monotonic(arr): return not (arr[1:] < arr[:-1]).any() diff --git a/pandas/core/series.py b/pandas/core/series.py index 9273c6499d..cefe031faa 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -16,13 +16,13 @@ _default_index, _maybe_upcast, _asarray_tuplesafe) from pandas.core.daterange import DateRange -from pandas.core.generic import PandasObject from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import _SeriesIndexer, _maybe_droplevels from pandas.util import py3compat from pandas.util.terminal import get_terminal_size import pandas.core.common as com import pandas.core.datetools as datetools +import pandas.core.generic as generic import pandas.core.nanops as nanops import pandas._tseries as lib import pandas._engines as _gin @@ -129,7 +129,7 @@ def _add_stat_doc(f, name, shortname, na_action=_doc_exclude_na, #------------------------------------------------------------------------------- # Series class -class Series(np.ndarray, PandasObject): +class Series(np.ndarray, generic.PandasObject): _AXIS_NUMBERS = { 'index' : 0 } @@ -1561,6 +1561,8 @@ def take(self, indices, axis=0): new_values = self.values.take(indices) return Series(new_values, index=new_index, name=self.name) + truncate = generic.truncate + def fillna(self, value=None, method='pad'): """ Fill NA/NaN values using the specified method diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index c3b81652cd..59e2a4aee8 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -9,8 +9,7 @@ import time import numpy as np -from pandas import (Series, TimeSeries, DataFrame, Panel, LongPanel, - Index, MultiIndex) +from pandas import Series, TimeSeries, DataFrame, Panel, Index, MultiIndex from pandas.core.common import adjoin import pandas.core.common as com import pandas._tseries 
as lib @@ -20,8 +19,7 @@ Series : 'series', TimeSeries : 'series', DataFrame : 'frame', - Panel : 'wide', - LongPanel : 'long' + Panel : 'wide' } _NAME_MAP = { @@ -32,7 +30,6 @@ 'wide' : 'Panel', 'wide_table' : 'Panel (Table)', 'long' : 'LongPanel', - # legacy h5 files 'Series' : 'Series', 'TimeSeries' : 'TimeSeries', @@ -244,7 +241,7 @@ def put(self, key, value, table=False, append=False, Parameters ---------- key : object - value : {Series, DataFrame, Panel, LongPanel} + value : {Series, DataFrame, Panel} table : boolean, default False Write as a PyTables Table structure which may perform worse but allow more flexible operations like searching / selecting subsets of @@ -294,7 +291,7 @@ def append(self, key, value): Parameters ---------- key : object - value : {Series, DataFrame, Panel, LongPanel} + value : {Series, DataFrame, Panel} Notes ----- @@ -340,6 +337,22 @@ def _write_frame(self, group, df): def _read_frame(self, group, where=None): return DataFrame(self._read_block_manager(group)) + def _write_long(self, group, panel): + if len(panel.values) == 0: + raise ValueError('Can not write empty structure, data length was 0') + self._write_block_manager(group, panel._data) + + def _read_long(self, group, where=None): + items = self._read_index(group, 'items') + major_axis = self._read_index(group, 'major_axis') + minor_axis = self._read_index(group, 'minor_axis') + major_labels = _read_array(group, 'major_labels') + minor_labels = _read_array(group, 'minor_labels') + values = _read_array(group, 'values') + index = MultiIndex(levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels]) + return DataFrame(values, index=index, columns=items) + def _write_block_manager(self, group, data): if not data.is_consolidated(): data = data.consolidate() @@ -404,31 +417,6 @@ def _write_wide_table(self, group, panel, append=False, comp=None): def _read_wide_table(self, group, where=None): return self._read_panel_table(group, where) - def _write_long(self, group, 
panel, append=False): - if len(panel.values) == 0: - raise ValueError('Can not write empty structure, data length was 0') - - self._write_index(group, 'major_axis', panel.major_axis) - self._write_index(group, 'minor_axis', panel.minor_axis) - self._write_index(group, 'items', panel.items) - self._write_array(group, 'major_labels', panel.major_labels) - self._write_array(group, 'minor_labels', panel.minor_labels) - self._write_array(group, 'values', panel.values) - - def _read_long(self, group, where=None): - from pandas.core.index import MultiIndex - - items = self._read_index(group, 'items') - major_axis = self._read_index(group, 'major_axis') - minor_axis = self._read_index(group, 'minor_axis') - major_labels = _read_array(group, 'major_labels') - minor_labels = _read_array(group, 'minor_labels') - values = _read_array(group, 'values') - - index = MultiIndex(levels=[major_axis, minor_axis], - labels=[major_labels, minor_labels]) - return LongPanel(values, index=index, columns=items) - def _write_index(self, group, key, index): if len(index) == 0: raise ValueError('Can not write empty structure, axis length was 0') @@ -663,12 +651,12 @@ def _read_panel_table(self, group, where=None): table._v_attrs.index_kind) # reconstruct long_index = MultiIndex.from_arrays([index, columns]) - lp = LongPanel(sel.values['values'], index=long_index, + lp = DataFrame(sel.values['values'], index=long_index, columns=fields) - if lp.consistent: + if not long_index.has_duplicates: lp = lp.sortlevel(level=0) - wp = lp.to_wide() + wp = lp.to_panel() else: if not self._quiet: # pragma: no cover print ('Duplicate entries in table, taking most recently ' @@ -686,8 +674,8 @@ def _read_panel_table(self, group, where=None): new_index = long_index.take(indexer) new_values = lp.values.take(indexer, axis=0) - lp = LongPanel(new_values, index=new_index, columns=lp.columns) - wp = lp.to_wide() + lp = DataFrame(new_values, index=new_index, columns=lp.columns) + wp = lp.to_panel() if 
sel.column_filter: new_minor = sorted(set(wp.minor_axis) & sel.column_filter) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 9980c99430..88aadbefe1 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -5,8 +5,7 @@ import numpy as np -from pandas import (Series, DataFrame, Panel, LongPanel, DateRange, - MultiIndex) +from pandas import Series, DataFrame, Panel, DateRange, MultiIndex from pandas.io.pytables import HDFStore import pandas.util.testing as tm @@ -345,14 +344,13 @@ def test_wide_table_dups(self): def test_long(self): def _check(left, right): - tm.assert_panel_equal(left.to_wide(), - right.to_wide()) + tm.assert_panel_equal(left.to_panel(), right.to_panel()) wp = tm.makePanel() - self._check_roundtrip(wp.to_long(), _check) + self._check_roundtrip(wp.to_frame(), _check) # empty - self.assertRaises(ValueError, self._check_roundtrip, wp.to_long()[:0], + self.assertRaises(ValueError, self._check_roundtrip, wp.to_frame()[:0], _check) def test_longpanel(self): diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index a168e516a8..c4544eb143 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -13,7 +13,6 @@ from pandas.core.series import Series from pandas.core.frame import (DataFrame, extract_index, _prep_ndarray, _default_index) -from pandas.core.panel import LongPanel import pandas.core.datetools as datetools from pandas.sparse.series import SparseSeries @@ -377,11 +376,11 @@ def xs(self, key, axis=0, copy=False): #---------------------------------------------------------------------- # Arithmetic-related methods - def _combine_frame(self, other, func, fill_value=None): + def _combine_frame(self, other, func, fill_value=None, level=None): new_index = self.index.union(other.index) new_columns = self.columns.union(other.columns) - if fill_value is not None: + if fill_value is not None or level is not None: raise NotImplementedError this = self @@ -457,7 +456,10 @@ 
def _combine_const(self, other, func): return self._constructor(data=new_data, index=self.index, columns=self.columns) - def _reindex_index(self, index, method, copy): + def _reindex_index(self, index, method, copy, level): + if level is not None: + raise Exception('Reindex by level not supported for sparse') + if self.index.equals(index): if copy: return self.copy() @@ -484,7 +486,10 @@ def _reindex_index(self, index, method, copy): return SparseDataFrame(new_series, index=index, columns=self.columns, default_fill_value=self.default_fill_value) - def _reindex_columns(self, columns, copy): + def _reindex_columns(self, columns, copy, level): + if level is not None: + raise Exception('Reindex by level not supported for sparse') + # TODO: fill value handling sdict = dict((k, v) for k, v in self.iteritems() if k in columns) return SparseDataFrame(sdict, index=self.index, columns=columns, @@ -682,7 +687,7 @@ def stack_sparse_frame(frame): index = MultiIndex(levels=[frame.index, frame.columns], labels=[major_labels, minor_labels]) - lp = LongPanel(stacked_values.reshape((nobs, 1)), index=index, + lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index, columns=['foo']) return lp.sortlevel(level=0) diff --git a/pandas/sparse/panel.py b/pandas/sparse/panel.py index 95890fda5c..f39ef6a45e 100644 --- a/pandas/sparse/panel.py +++ b/pandas/sparse/panel.py @@ -10,10 +10,11 @@ from pandas.core.common import _pickle_array, _unpickle_array, _mut_exclusive from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.frame import DataFrame -from pandas.core.panel import Panel, LongPanel +from pandas.core.panel import Panel from pandas.sparse.frame import SparseDataFrame +from pandas.util.decorators import deprecate class SparsePanelAxis(object): @@ -219,13 +220,13 @@ def copy(self): default_fill_value=self.default_fill_value, default_kind=self.default_kind) - def to_long(self, filter_observations=True): + def to_frame(self, filter_observations=True): """ - 
Convert SparsePanel to (dense) LongPanel + Convert SparsePanel to (dense) DataFrame Returns ------- - lp : LongPanel + frame : DataFrame """ if not filter_observations: raise Exception('filter_observations=False not supported for ' @@ -266,8 +267,11 @@ def to_long(self, filter_observations=True): index = MultiIndex(levels=[self.major_axis, self.minor_axis], labels=[major_labels, minor_labels]) - lp = LongPanel(values, index=index, columns=self.items) - return lp.sortlevel(level=0) + df = DataFrame(values, index=index, columns=self.items) + return df.sortlevel(level=0) + + to_long = deprecate('to_long', to_frame) + toLong = deprecate('toLong', to_frame) def reindex(self, major=None, items=None, minor=None, major_axis=None, minor_axis=None, copy=False): @@ -361,8 +365,6 @@ def _new_like(self, new_frames): default_kind=self.default_kind) def _combinePanel(self, other, func): - # if isinstance(other, LongPanel): - # other = other.to_wide() items = self.items + other.items major = self.major_axis + other.major_axis minor = self.minor_axis + other.minor_axis diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py index 09f30c87f3..1339392e19 100644 --- a/pandas/sparse/tests/test_sparse.py +++ b/pandas/sparse/tests/test_sparse.py @@ -1115,7 +1115,7 @@ def _check(frame): dense_frame = frame.to_dense() wp = Panel.from_dict({'foo' : frame}) - from_dense_lp = wp.to_long() + from_dense_lp = wp.to_frame() from_sparse_lp = spf.stack_sparse_frame(frame) @@ -1273,24 +1273,21 @@ def test_to_dense(self): dwp2 = Panel.from_dict(self.data_dict) assert_panel_equal(dwp, dwp2) - def test_to_long(self): + def test_to_frame(self): def _compare_with_dense(panel): - slp = panel.to_long() - dlp = panel.to_dense().to_long() + slp = panel.to_frame() + dlp = panel.to_dense().to_frame() self.assert_(np.array_equal(slp.values, dlp.values)) - self.assert_(np.array_equal(slp.major_labels, - dlp.major_labels)) - self.assert_(np.array_equal(slp.minor_labels, - 
dlp.minor_labels)) + self.assert_(slp.index.equals(dlp.index)) _compare_with_dense(self.panel) _compare_with_dense(self.panel.reindex(items=['ItemA'])) zero_panel = SparsePanel(self.data_dict, default_fill_value=0) - self.assertRaises(Exception, zero_panel.to_long) + self.assertRaises(Exception, zero_panel.to_frame) - self.assertRaises(Exception, self.panel.to_long, + self.assertRaises(Exception, self.panel.to_frame, filter_observations=False) def test_long_to_wide_sparse(self): @@ -1382,7 +1379,7 @@ def _dense_comp(op): _dense_comp(op5) # TODO: this case not yet supported! - # op6 = lambda x: x.add(x.to_long()) + # op6 = lambda x: x.add(x.to_frame()) # _dense_comp(op6) _check_ops(self.panel) diff --git a/pandas/src/internals.pyx b/pandas/src/internals.pyx new file mode 100644 index 0000000000..ebf47b9e3a --- /dev/null +++ b/pandas/src/internals.pyx @@ -0,0 +1,13 @@ +def get_reverse_indexer(ndarray[int32_t] indexer, Py_ssize_t length): + cdef: + Py_ssize_t i, n = len(indexer) + ndarray[int32_t] rev_indexer + int32_t idx + + rev_indexer = np.empty(length, dtype='i4') + for i in range(n): + idx = indexer[i] + if idx != -1: + rev_indexer[idx] = i + + return rev_indexer diff --git a/pandas/src/tseries.pyx b/pandas/src/tseries.pyx index dd7a1d2516..4875215d7e 100644 --- a/pandas/src/tseries.pyx +++ b/pandas/src/tseries.pyx @@ -499,3 +499,4 @@ include "reduce.pyx" include "stats.pyx" include "properties.pyx" include "inference.pyx" +include "internals.pyx" diff --git a/pandas/stats/fama_macbeth.py b/pandas/stats/fama_macbeth.py index 8640ded235..799e0f4d62 100644 --- a/pandas/stats/fama_macbeth.py +++ b/pandas/stats/fama_macbeth.py @@ -39,7 +39,7 @@ def __init__(self, y, x, intercept=True, nw_lags=None, time_effects=time_effects, x_effects=x_effects, cluster=cluster, dropped_dummies=dropped_dummies, verbose=verbose) - self._cols = self._ols_result._x.items + self._cols = self._ols_result._x.columns @cache_readonly def _beta_raw(self): diff --git 
a/pandas/stats/interface.py b/pandas/stats/interface.py index c0fed442e0..603d3b8289 100644 --- a/pandas/stats/interface.py +++ b/pandas/stats/interface.py @@ -1,6 +1,4 @@ -from pandas.core.api import (Series, DataFrame, Panel, LongPanel, - MultiIndex) - +from pandas.core.api import Series, DataFrame, Panel, MultiIndex from pandas.stats.ols import OLS, MovingOLS from pandas.stats.plm import PanelOLS, MovingPanelOLS, NonPooledPanelOLS import pandas.stats.common as common @@ -15,15 +13,14 @@ def ols(**kwargs): y : Series, x : DataFrame -> OLS y : Series, x : dict of DataFrame -> OLS y : DataFrame, x : DataFrame -> PanelOLS - y : DataFrame, x : dict of DataFrame/Panel/LongPanel -> PanelOLS - y : Series with MultiIndex, x : Panel/LongPanel -> PanelOLS + y : DataFrame, x : dict of DataFrame/Panel -> PanelOLS + y : Series with MultiIndex, x : Panel/DataFrame + MultiIndex -> PanelOLS Parameters ---------- y: Series or DataFrame See above for types - x: Series, DataFrame, dict of Series, dict of DataFrame, Panel, or - LongPanel + x: Series, DataFrame, dict of Series, dict of DataFrame, Panel weights : Series or ndarray The weights are presumed to be (proportional to) the inverse of the variance of the observations. 
That is, if the variables are to be @@ -110,7 +107,7 @@ def ols(**kwargs): if isinstance(y, DataFrame) or (isinstance(y, Series) and isinstance(y.index, MultiIndex)): panel = True - if isinstance(x, (Panel, LongPanel)): + if isinstance(x, Panel): panel = True if window_type == 'full_sample': diff --git a/pandas/stats/ols.py b/pandas/stats/ols.py index 3fea01661d..d7d3d26e20 100644 --- a/pandas/stats/ols.py +++ b/pandas/stats/ols.py @@ -11,7 +11,7 @@ from pandas.core.api import DataFrame, Series from pandas.core.index import MultiIndex -from pandas.core.panel import Panel, LongPanel +from pandas.core.panel import Panel from pandas.util.decorators import cache_readonly import pandas.stats.common as common import pandas.stats.math as math @@ -32,6 +32,8 @@ class OLS(object): nw_lags: None or int Number of Newey-West lags. """ + _panel_model = False + def __init__(self, y, x, intercept=True, weights=None, nw_lags=None, nw_overlap=False): import scikits.statsmodels.api as sm @@ -753,7 +755,7 @@ def _cum_xx(self, x): cum_xx = [] slicer = lambda df, dt: df.truncate(dt, dt).values - if isinstance(x, DataFrame) and not isinstance(x, LongPanel): + if not self._panel_model: _get_index = x.index.get_loc def slicer(df, dt): i = _get_index(dt) @@ -778,7 +780,7 @@ def _cum_xy(self, x, y): cum_xy = [] x_slicer = lambda df, dt: df.truncate(dt, dt).values - if isinstance(x, DataFrame) and not isinstance(x, LongPanel): + if not self._panel_model: _get_index = x.index.get_loc def x_slicer(df, dt): i = _get_index(dt) diff --git a/pandas/stats/plm.py b/pandas/stats/plm.py index 60bf00b7bf..fdf0ddd25b 100644 --- a/pandas/stats/plm.py +++ b/pandas/stats/plm.py @@ -10,12 +10,12 @@ import numpy as np -from pandas.core.panel import Panel, LongPanel +from pandas.core.panel import Panel from pandas.core.frame import DataFrame from pandas.core.series import Series from pandas.core.sparse import SparsePanel from pandas.stats.ols import OLS, MovingOLS -import pandas.stats.common as common +import 
pandas.stats.common as com import pandas.stats.math as math from pandas.util.decorators import cache_readonly @@ -24,6 +24,8 @@ class PanelOLS(OLS): See ols function docs """ + _panel_model = True + def __init__(self, y, x, weights=None, intercept=True, nw_lags=None, entity_effects=False, time_effects=False, x_effects=None, cluster=None, dropped_dummies=None, verbose=False, @@ -39,14 +41,14 @@ def __init__(self, y, x, weights=None, intercept=True, nw_lags=None, self._time_effects = time_effects self._x_effects = x_effects self._dropped_dummies = dropped_dummies or {} - self._cluster = common._get_cluster_type(cluster) + self._cluster = com._get_cluster_type(cluster) self._verbose = verbose (self._x, self._x_trans, self._x_filtered, self._y, self._y_trans) = self._prepare_data() - self._index = self._x.major_axis + self._index = self._x.index.levels[0] self._T = len(self._index) @@ -55,7 +57,7 @@ def log(self, msg): print msg def _prepare_data(self): - """Cleans and converts input data into LongPanel classes. + """Cleans and stacks input data into DataFrame objects If time effects is True, then we turn off intercepts and omit an item from every (entity and x) fixed effect. 
@@ -75,11 +77,11 @@ def _prepare_data(self): x_filtered = self._add_dummies(x_filtered, cat_mapping) if self._x_effects: - x = x.filter(x.items - self._x_effects) - x_filtered = x_filtered.filter(x_filtered.items - self._x_effects) + x = x.drop(self._x_effects, axis=1) + x_filtered = x_filtered.drop(self._x_effects, axis=1) if self._time_effects: - x_regressor = x.subtract(x.mean('minor', broadcast=True)) + x_regressor = x.sub(x.mean(level=0), level=0) unstacked_y = y.unstack() y_regressor = unstacked_y.sub(unstacked_y.mean(1), axis=0).stack() @@ -113,8 +115,8 @@ def _filter_data(self): data = self._x_orig cat_mapping = {} - if isinstance(data, LongPanel): - data = data.to_wide() + if isinstance(data, DataFrame): + data = data.to_panel() else: if isinstance(data, Panel): data = data.copy() @@ -131,9 +133,9 @@ def _filter_data(self): data['__weights__'] = self._weights # Filter x's without y (so we can make a prediction) - filtered = data.to_long() + filtered = data.to_frame() - # Filter all data together using to_long + # Filter all data together using to_frame # convert to DataFrame y = self._y_orig @@ -141,7 +143,7 @@ def _filter_data(self): y = y.unstack() data['__y__'] = y - data_long = data.to_long() + data_long = data.to_frame() x_filt = filtered.filter(x_names) x = data_long.filter(x_names) @@ -187,11 +189,11 @@ def _convert_x(self, x): def _add_dummies(self, panel, mapping): """ - Add entity and / or categorical dummies to input X LongPanel + Add entity and / or categorical dummies to input X DataFrame Returns ------- - LongPanel + DataFrame """ panel = self._add_entity_effects(panel) panel = self._add_categorical_dummies(panel, mapping) @@ -204,28 +206,30 @@ def _add_entity_effects(self, panel): Returns ------- - LongPanel + DataFrame """ + from pandas.core.panel import make_axis_dummies + if not self._entity_effects: return panel self.log('-- Adding entity fixed effect dummies') - dummies = panel.get_axis_dummies(axis='minor') + dummies = 
make_axis_dummies(panel, 'minor') if not self._use_all_dummies: if 'entity' in self._dropped_dummies: to_exclude = str(self._dropped_dummies.get('entity')) else: - to_exclude = dummies.items[0] + to_exclude = dummies.columns[0] - if to_exclude not in dummies.items: + if to_exclude not in dummies.columns: raise Exception('%s not in %s' % (to_exclude, - dummies.items)) + dummies.columns)) self.log('-- Excluding dummy for entity: %s' % to_exclude) - dummies = dummies.filter(dummies.items - [to_exclude]) + dummies = dummies.filter(dummies.columns - [to_exclude]) dummies = dummies.add_prefix('FE_') panel = panel.join(dummies) @@ -238,8 +242,10 @@ def _add_categorical_dummies(self, panel, cat_mappings): Returns ------- - LongPanel + DataFrame """ + from pandas.core.panel import make_dummies + if not self._x_effects: return panel @@ -248,7 +254,7 @@ def _add_categorical_dummies(self, panel, cat_mappings): for effect in self._x_effects: self.log('-- Adding fixed effect dummies for %s' % effect) - dummies = panel.get_dummies(effect) + dummies = make_dummies(panel, effect) val_map = cat_mappings.get(effect) if val_map: @@ -261,15 +267,15 @@ def _add_categorical_dummies(self, panel, cat_mappings): if val_map: mapped_name = val_map[to_exclude] else: - to_exclude = mapped_name = dummies.items[0] + to_exclude = mapped_name = dummies.columns[0] - if mapped_name not in dummies.items: # pragma: no cover + if mapped_name not in dummies.columns: # pragma: no cover raise Exception('%s not in %s' % (to_exclude, - dummies.items)) + dummies.columns)) self.log('-- Excluding dummy for %s: %s' % (effect, to_exclude)) - dummies = dummies.filter(dummies.items - [mapped_name]) + dummies = dummies.filter(dummies.columns - [mapped_name]) dropped_dummy = True dummies = _convertDummies(dummies, cat_mappings.get(effect)) @@ -299,7 +305,7 @@ def _beta_raw(self): @cache_readonly def beta(self): - return Series(self._beta_raw, index=self._x.items) + return Series(self._beta_raw, index=self._x.columns) 
@cache_readonly def _df_model_raw(self): @@ -391,10 +397,8 @@ def y_fitted(self): def _unstack_vector(self, vec, index=None): if index is None: index = self._y_trans.index - panel = LongPanel(vec.reshape((len(vec), 1)), index=index, - columns=['dummy']) - - return panel.to_wide()['dummy'] + panel = DataFrame(vec, index=index, columns=['dummy']) + return panel.to_panel()['dummy'] def _unstack_y(self, vec): unstacked = self._unstack_vector(vec) @@ -415,7 +419,7 @@ def _nobs(self): def _convertDummies(dummies, mapping): # cleans up the names of the generated dummies new_items = [] - for item in dummies.items: + for item in dummies.columns: if not mapping: var = str(item) if isinstance(item, float): @@ -426,7 +430,7 @@ def _convertDummies(dummies, mapping): # renames the dummies if a conversion dict is provided new_items.append(mapping[int(item)]) - dummies = LongPanel(dummies.values, index=dummies.index, + dummies = DataFrame(dummies.values, index=dummies.index, columns=new_items) return dummies @@ -444,7 +448,7 @@ def add_intercept(panel, name='intercept'): Parameters ---------- - panel: Panel (Long or Wide) + panel: Panel / DataFrame name: string, default 'intercept'] Returns @@ -461,6 +465,8 @@ class MovingPanelOLS(MovingOLS, PanelOLS): See ols function docs """ + _panel_model = True + def __init__(self, y, x, weights=None, window_type='expanding', window=None, min_periods=None, @@ -490,7 +496,7 @@ def __init__(self, y, x, weights=None, self._set_window(window_type, window, min_periods) if min_obs is None: - min_obs = len(self._x.items) + 1 + min_obs = len(self._x.columns) + 1 self._min_obs = min_obs @@ -544,7 +550,7 @@ def _var_beta_raw(self): x = self._x y = self._y - dates = x.major_axis + dates = x.index.levels[0] cluster_axis = None if self._cluster == 'time': @@ -630,7 +636,7 @@ def _enough_obs(self): # XXX: what's the best way to determine where to start? 
# TODO: write unit tests for this - rank_threshold = len(self._x.items) + 1 + rank_threshold = len(self._x.columns) + 1 if self._min_obs < rank_threshold: # pragma: no cover warnings.warn('min_obs is smaller than rank of X matrix') @@ -722,8 +728,6 @@ def __init__(self, y, x, window_type='full_sample', window=None, def _var_beta_panel(y, x, beta, xx, rmse, cluster_axis, nw_lags, nobs, df, nw_overlap): from pandas.core.frame import group_agg - from pandas.core.panel import LongPanel - xx_inv = math.inv(xx) yv = y.values @@ -740,12 +744,11 @@ def _var_beta_panel(y, x, beta, xx, rmse, cluster_axis, return np.dot(xx_inv, np.dot(xeps, xx_inv)) else: Xb = np.dot(x.values, beta).reshape((len(x.values), 1)) - resid = LongPanel(yv[:, None] - Xb, index=y.index, - columns=['resid']) + resid = DataFrame(yv[:, None] - Xb, index=y.index, columns=['resid']) if cluster_axis == 1: - x = x.swapaxes() - resid = resid.swapaxes() + x = x.swaplevel(0, 1).sortlevel(0) + resid = resid.swaplevel(0, 1).sortlevel(0) m = group_agg(x.values * resid.values, x.index._bounds, lambda x: np.sum(x, axis=0)) @@ -754,7 +757,7 @@ def _var_beta_panel(y, x, beta, xx, rmse, cluster_axis, nw_lags = 0 xox = 0 - for i in range(len(x.major_axis)): + for i in range(len(x.index.levels[0])): xox += math.newey_west(m[i : i + 1], nw_lags, nobs, df, nw_overlap) @@ -766,7 +769,7 @@ def _xx_time_effects(x, y): """ # X'X xx = np.dot(x.values.T, x.values) - xt = x.sum('minor').values + xt = x.sum(level=0).values count = y.unstack().count(1).values selector = count > 0 diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py index 8f5ece7bf8..87de198d15 100644 --- a/pandas/stats/tests/test_ols.py +++ b/pandas/stats/tests/test_ols.py @@ -286,7 +286,7 @@ def test_y_predict(self): def test_longpanel_series_combo(self): wp = tm.makePanel() - lp = wp.to_long() + lp = wp.to_frame() y = lp.pop('ItemA') model = ols(y=y, x=lp, entity_effects=True, window=20) @@ -392,18 +392,18 @@ def testFiltering(self): 
result = ols(y=self.panel_y2, x=self.panel_x2) x = result._x - index = [x.major_axis[i] for i in x.major_labels] + index = x.index.get_level_values(0) index = Index(sorted(set(index))) exp_index = Index([datetime(2000, 1, 1), datetime(2000, 1, 3)]) - self.assertTrue(exp_index.equals(index)) + self.assertTrue(exp_index.equals(index)) - index = [x.minor_axis[i] for i in x.minor_labels] + index = x.index.get_level_values(1) index = Index(sorted(set(index))) exp_index = Index(['A', 'B']) self.assertTrue(exp_index.equals(index)) x = result._x_filtered - index = [x.major_axis[i] for i in x.major_labels] + index = x.index.get_level_values(0) index = Index(sorted(set(index))) exp_index = Index([datetime(2000, 1, 1), datetime(2000, 1, 3), @@ -424,7 +424,7 @@ def testFiltering(self): [12, 21, 1]] assert_almost_equal(exp_x_filtered, result._x_filtered.values) - self.assertTrue(result._x_filtered.major_axis.equals( + self.assertTrue(result._x_filtered.index.levels[0].equals( result.y_fitted.index)) def test_wls_panel(self): @@ -496,26 +496,24 @@ def testWithXEffects(self): result = ols(y=self.panel_y2, x=self.panel_x2, x_effects=['x1']) assert_almost_equal(result._y.values.flat, [1, 4, 5]) - exp_x = [[0, 0, 14, 1], [0, 1, 17, 1], [1, 0, 48, 1]] - assert_almost_equal(result._x.values, exp_x) - exp_index = Index(['x1_30', 'x1_9', 'x2', 'intercept']) - self.assertTrue(exp_index.equals(result._x.items)) - - # _check_non_raw_results(result) + res = result._x + exp_x = DataFrame([[0, 0, 14, 1], [0, 1, 17, 1], [1, 0, 48, 1]], + columns=['x1_30', 'x1_9', 'x2', 'intercept'], + index=res.index, dtype=float) + assert_frame_equal(res, exp_x.reindex(columns=res.columns)) def testWithXEffectsAndDroppedDummies(self): result = ols(y=self.panel_y2, x=self.panel_x2, x_effects=['x1'], dropped_dummies={'x1' : 30}) + res = result._x assert_almost_equal(result._y.values.flat, [1, 4, 5]) - exp_x = [[1, 0, 14, 1], [0, 1, 17, 1], [0, 0, 48, 1]] - assert_almost_equal(result._x.values, exp_x) - - 
exp_index = Index(['x1_6', 'x1_9', 'x2', 'intercept']) - self.assertTrue(exp_index.equals(result._x.items)) + exp_x = DataFrame([[1, 0, 14, 1], [0, 1, 17, 1], [0, 0, 48, 1]], + columns=['x1_6', 'x1_9', 'x2', 'intercept'], + index=res.index, dtype=float) - # _check_non_raw_results(result) + assert_frame_equal(res, exp_x.reindex(columns=res.columns)) def testWithXEffectsAndConversion(self): result = ols(y=self.panel_y3, x=self.panel_x3, x_effects=['x1', 'x2']) @@ -526,7 +524,7 @@ def testWithXEffectsAndConversion(self): assert_almost_equal(result._x.values, exp_x) exp_index = Index(['x1_B', 'x1_C', 'x2_baz', 'x2_foo', 'intercept']) - self.assertTrue(exp_index.equals(result._x.items)) + self.assertTrue(exp_index.equals(result._x.columns)) # _check_non_raw_results(result) @@ -540,7 +538,7 @@ def testWithXEffectsAndConversionAndDroppedDummies(self): assert_almost_equal(result._x.values, exp_x) exp_index = Index(['x1_B', 'x1_C', 'x2_bar', 'x2_baz', 'intercept']) - self.assertTrue(exp_index.equals(result._x.items)) + self.assertTrue(exp_index.equals(result._x.columns)) # _check_non_raw_results(result) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 78d932ad8c..043c36597c 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1720,6 +1720,15 @@ def test_pop(self): def test_iter(self): self.assert_(tm.equalContents(list(self.frame), self.frame.columns)) + def test_iterrows(self): + for i, (k, v) in enumerate(self.frame.iterrows()): + exp = self.frame.xs(self.frame.index[i]) + assert_series_equal(v, exp) + + for i, (k, v) in enumerate(self.mixed_frame.iterrows()): + exp = self.mixed_frame.xs(self.mixed_frame.index[i]) + assert_series_equal(v, exp) + def test_len(self): self.assertEqual(len(self.frame), len(self.frame.index)) @@ -2611,8 +2620,8 @@ def test_pivot(self): # pivot multiple columns wp = tm.makePanel() - lp = wp.to_long() - df = DataFrame.from_records(lp.toRecords()) + lp = wp.to_frame() + df = lp.reset_index() 
assert_frame_equal(df.pivot('major', 'minor'), lp.unstack()) def test_pivot_duplicates(self): diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 7e304d797a..5123596ff9 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -923,16 +923,6 @@ def test_format(self): def test_bounds(self): self.index._bounds - def test_makeMask(self): - from pandas.core.panel import make_mask - - mask = make_mask(self.index) - expected = np.array([True, True, - True, False, - False, True, - True, True], dtype=bool) - self.assert_(np.array_equal(mask, expected)) - def test_equals(self): self.assert_(self.index.equals(self.index)) self.assert_(self.index.equal_levels(self.index)) @@ -1168,6 +1158,40 @@ def test_take_preserve_name(self): taken = self.index.take([3,0,1]) self.assertEqual(taken.names, self.index.names) + def test_join_level(self): + other = Index(['three', 'one', 'two']) + + def _check_how(how): + join_index, lidx, ridx = other.join(self.index, how=how, + level='second', + return_indexers=True) + + join_index2, ridx2, lidx2 = self.index.join(other, how=how, + level='second', + return_indexers=True) + + self.assert_(join_index.equals(join_index2)) + self.assert_(np.array_equal(lidx, lidx2)) + self.assert_(np.array_equal(ridx, ridx2)) + + exp_level = self.index.levels[1].join(other, how=how) + self.assert_(join_index.levels[0].equals(self.index.levels[0])) + self.assert_(join_index.levels[1].equals(exp_level)) + + _check_how('outer') + _check_how('inner') + _check_how('left') + _check_how('right') + + def test_has_duplicates(self): + self.assert_(not self.index.has_duplicates) + self.assert_(self.index.append(self.index).has_duplicates) + + index = MultiIndex(levels=[[0, 1], [0, 1, 2]], + labels=[[0, 0, 0, 0, 1, 1, 1], + [0, 1, 2, 0, 0, 1, 2]]) + self.assert_(index.has_duplicates) + class TestFactor(unittest.TestCase): def setUp(self): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 
10575247e1..7bb48d0878 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -54,6 +54,33 @@ def test_append(self): result = a['A'].append(b['A']) tm.assert_series_equal(result, self.frame['A']) + def test_align_level(self): + # axis=0 + month_sums = self.ymd.sum(level='month') + result = month_sums.reindex(self.ymd.index, level=1) + expected = self.ymd.groupby(level='month').transform(np.sum) + + assert_frame_equal(result, expected) + + # axis=1 + month_sums = self.ymd.T.sum(axis=1, level='month') + result = month_sums.reindex(columns=self.ymd.index, level=1) + expected = self.ymd.groupby(level='month').transform(np.sum).T + assert_frame_equal(result, expected) + + def test_binops_level(self): + def _check_op(opname): + op = getattr(DataFrame, opname) + result = op(self.ymd, self.ymd.sum(level='month'), level='month') + broadcasted = self.ymd.groupby(level='month').transform(np.sum) + expected = op(self.ymd, broadcasted) + assert_frame_equal(result, expected) + + _check_op('sub') + _check_op('add') + _check_op('mul') + _check_op('div') + def test_pickle(self): import cPickle def _test_roundtrip(frame): @@ -191,6 +218,23 @@ def test_getitem_toplevel(self): assert_frame_equal(result, expected) assert_frame_equal(result, result2) + def test_getitem_slice_integers(self): + index = MultiIndex(levels=[[0, 1, 2], [0, 2]], + labels=[[0, 0, 1, 1, 2, 2], + [0, 1, 0, 1, 0, 1]]) + + frame = DataFrame(np.random.randn(len(index), 4), index=index, + columns=['a', 'b', 'c', 'd']) + res = frame.ix[1:2] + exp = frame[2:] + assert_frame_equal(res, exp) + + series = Series(np.random.randn(len(index)), index=index) + + res = series.ix[1:2] + exp = series[2:] + assert_series_equal(res, exp) + def test_getitem_int(self): levels = [[0, 1], [0, 1, 2]] labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 2c8932b91a..3f1e54994a 100644 --- a/pandas/tests/test_panel.py +++ 
b/pandas/tests/test_panel.py @@ -11,7 +11,7 @@ from pandas import DataFrame, Index, isnull, notnull, pivot, MultiIndex from pandas.core.datetools import bday from pandas.core.frame import group_agg -from pandas.core.panel import Panel, LongPanel +from pandas.core.panel import Panel from pandas.core.series import remove_na import pandas.core.common as com import pandas.core.panel as panelmod @@ -355,12 +355,8 @@ def test_delitem_and_pop(self): assert_frame_equal(panelc[0], panel[0]) def test_setitem(self): - # LongPanel with one item - lp = self.panel.filter(['ItemA']).to_long() - self.panel['ItemE'] = lp - - lp = self.panel.filter(['ItemA', 'ItemB']).to_long() + lp = self.panel.filter(['ItemA', 'ItemB']).to_frame() self.assertRaises(Exception, self.panel.__setitem__, 'ItemE', lp) @@ -825,11 +821,6 @@ def test_fillna(self): filled = empty.fillna(0) assert_panel_equal(filled, empty) - def test_combinePanel_with_long(self): - lng = self.panel.to_long(filter_observations=False) - result = self.panel.add(lng) - self.assert_panel_equal(result, self.panel * 2) - def test_swapaxes(self): result = self.panel.swapaxes('items', 'minor') self.assert_(result.items is self.panel.minor_axis) @@ -847,22 +838,22 @@ def test_swapaxes(self): # this should also work self.assertRaises(Exception, self.panel.swapaxes, 'items', 'items') - def test_to_long(self): + def test_to_frame(self): # filtered - filtered = self.panel.to_long() + filtered = self.panel.to_frame() # unfiltered - unfiltered = self.panel.to_long(filter_observations=False) + unfiltered = self.panel.to_frame(filter_observations=False) - assert_panel_equal(unfiltered.to_wide(), self.panel) + assert_panel_equal(unfiltered.to_panel(), self.panel) - def test_to_long_mixed(self): + def test_to_frame_mixed(self): panel = self.panel.fillna(0) panel['str'] = 'foo' panel['bool'] = panel['ItemA'] > 0 - lp = panel.to_long() - wp = lp.to_wide() + lp = panel.to_frame() + wp = lp.to_panel() self.assertEqual(wp['bool'].values.dtype, 
np.bool_) assert_frame_equal(wp['bool'], panel['bool']) @@ -966,107 +957,22 @@ def test_rename(self): self.assert_((self.panel['ItemA'].values == 3).all()) class TestLongPanel(unittest.TestCase): + """ + LongPanel no longer exists, but... + """ def setUp(self): panel = tm.makePanel() tm.add_nans(panel) - self.panel = panel.to_long() - self.unfiltered_panel = panel.to_long(filter_observations=False) - - def test_pickle(self): - import cPickle - - pickled = cPickle.dumps(self.panel) - unpickled = cPickle.loads(pickled) - - assert_almost_equal(unpickled['ItemA'].values, - self.panel['ItemA'].values) - - def test_len(self): - len(self.unfiltered_panel) - - def test_constructor(self): - pass - - def test_fromRecords_toRecords(self): - # structured array - K = 10 - - recs = np.zeros(K, dtype='O,O,f8,f8') - recs['f0'] = range(K // 2) * 2 - recs['f1'] = np.arange(K) / (K // 2) - recs['f2'] = np.arange(K) * 2 - recs['f3'] = np.arange(K) - - lp = LongPanel.fromRecords(recs, 'f0', 'f1') - self.assertEqual(len(lp.items), 2) - - lp = LongPanel.fromRecords(recs, 'f0', 'f1', exclude=['f2']) - self.assertEqual(len(lp.items), 1) - - torecs = lp.toRecords() - self.assertEqual(len(torecs.dtype.names), len(lp.items) + 2) - - # DataFrame - df = DataFrame.from_records(recs) - lp = LongPanel.fromRecords(df, 'f0', 'f1', exclude=['f2']) - self.assertEqual(len(lp.items), 1) - - # dict of arrays - series = DataFrame.from_records(recs)._series - lp = LongPanel.fromRecords(series, 'f0', 'f1', exclude=['f2']) - self.assertEqual(len(lp.items), 1) - self.assert_('f2' in series) - - self.assertRaises(Exception, LongPanel.fromRecords, np.zeros((3, 3)), - 0, 1) - - def test_factors(self): - # structured array - K = 10 - - recs = np.zeros(K, dtype='O,O,f8,f8,O,O') - recs['f0'] = ['one'] * 5 + ['two'] * 5 - recs['f1'] = ['A', 'B', 'C', 'D', 'E'] * 2 - recs['f2'] = np.arange(K) * 2 - recs['f3'] = np.arange(K) - recs['f4'] = ['A', 'B', 'C', 'D', 'E'] * 2 - recs['f5'] = ['foo', 'bar'] * 5 - - lp = 
LongPanel.fromRecords(recs, 'f0', 'f1') - - def test_columns(self): - self.assert_(np.array_equal(self.panel.items, self.panel.columns)) - - def test_copy(self): - thecopy = self.panel.copy() - self.assert_(np.array_equal(thecopy.values, self.panel.values)) - self.assert_(thecopy.values is not self.panel.values) - - def test_getitem(self): - col = self.panel['ItemA'] - - def test_setitem(self): - self.panel['ItemE'] = self.panel['ItemA'] - self.panel['ItemF'] = 1. - - wp = self.panel.to_wide() - assert_frame_equal(wp['ItemA'], wp['ItemE']) - - itemf = wp['ItemF'].values.ravel() - self.assert_((itemf[np.isfinite(itemf)] == 1).all()) - - # check exceptions raised - lp = self.panel.filter(['ItemA', 'ItemB']) - lp2 = self.panel.filter(['ItemC', 'ItemE']) - self.assertRaises(Exception, lp.__setitem__, 'foo', lp2) + self.panel = panel.to_frame() + self.unfiltered_panel = panel.to_frame(filter_observations=False) def test_ops_differently_indexed(self): # trying to set non-identically indexed panel - wp = self.panel.to_wide() + wp = self.panel.to_panel() wp2 = wp.reindex(major=wp.major_axis[:-1]) - lp2 = wp2.to_long() + lp2 = wp2.to_frame() result = self.panel + lp2 assert_frame_equal(result.reindex(lp2.index), lp2 * 2) @@ -1082,14 +988,14 @@ def test_ops_scalar(self): assert_frame_equal(result, expected) def test_combineFrame(self): - wp = self.panel.to_wide() - result = self.panel.add(wp['ItemA']) - assert_frame_equal(result.to_wide()['ItemA'], wp['ItemA'] * 2) + wp = self.panel.to_panel() + result = self.panel.add(wp['ItemA'].stack(), axis=0) + assert_frame_equal(result.to_panel()['ItemA'], wp['ItemA'] * 2) def test_combinePanel(self): - wp = self.panel.to_wide() + wp = self.panel.to_panel() result = self.panel.add(self.panel) - wide_result = result.to_wide() + wide_result = result.to_panel() assert_frame_equal(wp['ItemA'] * 2, wide_result['ItemA']) # one item @@ -1112,8 +1018,8 @@ def test_combine_series(self): assert_frame_equal(result, expected) def 
test_operators(self): - wp = self.panel.to_wide() - result = (self.panel + 1).to_wide() + wp = self.panel.to_panel() + result = (self.panel + 1).to_panel() assert_frame_equal(wp['ItemA'] + 1, result['ItemA']) def test_sort(self): @@ -1121,14 +1027,10 @@ def is_sorted(arr): return (arr[1:] > arr[:-1]).any() sorted_minor = self.panel.sortlevel(level=1) - self.assert_(is_sorted(sorted_minor.minor_labels)) + self.assert_(is_sorted(sorted_minor.index.labels[1])) sorted_major = sorted_minor.sortlevel(level=0) - self.assert_(is_sorted(sorted_major.major_labels)) - - def test_toCSV(self): - self.panel.toCSV('__tmp__') - os.remove('__tmp__') + self.assert_(is_sorted(sorted_major.index.labels[0])) def test_to_string(self): from cStringIO import StringIO @@ -1136,80 +1038,67 @@ def test_to_string(self): buf = StringIO() self.panel.to_string(buf) - def test_swapaxes(self): - swapped = self.panel.swapaxes() - - self.assert_(swapped.major_axis is self.panel.minor_axis) - - # what else to test here? - def test_truncate(self): - dates = self.panel.major_axis + dates = self.panel.index.levels[0] start, end = dates[1], dates[5] - trunced = self.panel.truncate(start, end).to_wide() - expected = self.panel.to_wide()['ItemA'].truncate(start, end) + trunced = self.panel.truncate(start, end).to_panel() + expected = self.panel.to_panel()['ItemA'].truncate(start, end) assert_frame_equal(trunced['ItemA'], expected) - trunced = self.panel.truncate(before=start).to_wide() - expected = self.panel.to_wide()['ItemA'].truncate(before=start) + trunced = self.panel.truncate(before=start).to_panel() + expected = self.panel.to_panel()['ItemA'].truncate(before=start) assert_frame_equal(trunced['ItemA'], expected) - trunced = self.panel.truncate(after=end).to_wide() - expected = self.panel.to_wide()['ItemA'].truncate(after=end) + trunced = self.panel.truncate(after=end).to_panel() + expected = self.panel.to_panel()['ItemA'].truncate(after=end) assert_frame_equal(trunced['ItemA'], expected) # truncate 
on dates that aren't in there - wp = self.panel.to_wide() + wp = self.panel.to_panel() new_index = wp.major_axis[::5] wp2 = wp.reindex(major=new_index) - lp2 = wp2.to_long() + lp2 = wp2.to_frame() lp_trunc = lp2.truncate(wp.major_axis[2], wp.major_axis[-2]) wp_trunc = wp2.truncate(wp.major_axis[2], wp.major_axis[-2]) - assert_panel_equal(wp_trunc, lp_trunc.to_wide()) + assert_panel_equal(wp_trunc, lp_trunc.to_panel()) # throw proper exception self.assertRaises(Exception, lp2.truncate, wp.major_axis[-2], wp.major_axis[2]) - - def test_filter(self): - pass - def test_axis_dummies(self): - minor_dummies = self.panel.get_axis_dummies('minor') - self.assertEqual(len(minor_dummies.items), - len(self.panel.minor_axis)) + minor_dummies = panelmod.make_axis_dummies(self.panel, 'minor') + self.assertEqual(len(minor_dummies.columns), + len(self.panel.index.levels[1])) - major_dummies = self.panel.get_axis_dummies('major') - self.assertEqual(len(major_dummies.items), - len(self.panel.major_axis)) + major_dummies = panelmod.make_axis_dummies(self.panel, 'major') + self.assertEqual(len(major_dummies.columns), + len(self.panel.index.levels[0])) mapping = {'A' : 'one', 'B' : 'one', 'C' : 'two', 'D' : 'two'} - transformed = self.panel.get_axis_dummies('minor', + transformed = panelmod.make_axis_dummies(self.panel, 'minor', transform=mapping.get) - self.assertEqual(len(transformed.items), 2) - self.assert_(np.array_equal(transformed.items, ['one', 'two'])) + self.assertEqual(len(transformed.columns), 2) + self.assert_(np.array_equal(transformed.columns, ['one', 'two'])) # TODO: test correctness def test_get_dummies(self): - self.panel['Label'] = self.panel.minor_labels - - minor_dummies = self.panel.get_axis_dummies('minor') - dummies = self.panel.get_dummies('Label') - + self.panel['Label'] = self.panel.index.labels[1] + minor_dummies = panelmod.make_axis_dummies(self.panel, 'minor') + dummies = panelmod.make_dummies(self.panel, 'Label') self.assert_(np.array_equal(dummies.values, 
minor_dummies.values)) def test_apply(self): @@ -1219,22 +1108,17 @@ def test_apply(self): np.sqrt(self.panel.values))) def test_mean(self): - means = self.panel.mean('major') + means = self.panel.mean(level='minor') # test versus Panel version - wide_means = self.panel.to_wide().mean('major') + wide_means = self.panel.to_panel().mean('major') assert_frame_equal(means, wide_means) - means_broadcast = self.panel.mean('major', broadcast=True) - self.assert_(isinstance(means_broadcast, LongPanel)) - - # how to check correctness? - def test_sum(self): - sums = self.panel.sum('major') + sums = self.panel.sum(level='minor') # test versus Panel version - wide_sums = self.panel.to_wide().sum('major') + wide_sums = self.panel.to_panel().sum('major') assert_frame_equal(sums, wide_sums) def test_count(self): @@ -1256,18 +1140,11 @@ def test_join(self): joined = lp1.join(lp2) - self.assertEqual(len(joined.items), 3) + self.assertEqual(len(joined.columns), 3) self.assertRaises(Exception, lp1.join, self.panel.filter(['ItemB', 'ItemC'])) - def test_merge(self): - pass - - def test_add_prefix(self): - lp = self.panel.add_prefix('foo#') - self.assertEqual(lp.items[0], 'foo#ItemA') - def test_pivot(self): from pandas.core.reshape import _slow_pivot diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 9b12288808..d1b0d318ac 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -440,9 +440,9 @@ def test_ix_getitem(self): assert_series_equal(self.series.ix[5::2], self.series[5::2]) # slice with indices - d1, d2 = self.series.index[[5, 15]] - result = self.series.ix[d1:d2] - expected = self.series.truncate(d1, d2) + d1, d2 = self.ts.index[[5, 15]] + result = self.ts.ix[d1:d2] + expected = self.ts.truncate(d1, d2) assert_series_equal(result, expected) # boolean @@ -450,8 +450,8 @@ def test_ix_getitem(self): assert_series_equal(self.series.ix[mask], self.series[mask]) # ask for index value - self.assertEquals(self.series.ix[d1], 
self.series[d1]) - self.assertEquals(self.series.ix[d2], self.series[d2]) + self.assertEquals(self.ts.ix[d1], self.ts[d1]) + self.assertEquals(self.ts.ix[d2], self.ts[d2]) def test_ix_getitem_iterator(self): idx = iter(self.series.index[:10]) @@ -1266,9 +1266,9 @@ def test_truncate(self): truncated = ts.truncate(before=self.ts.index[-1] + offset) assert(len(truncated) == 0) - truncated = ts.truncate(before=self.ts.index[-1] + offset, - after=self.ts.index[0] - offset) - assert(len(truncated) == 0) + self.assertRaises(Exception, ts.truncate, + before=self.ts.index[-1] + offset, + after=self.ts.index[0] - offset) def test_asof(self): self.ts[5:10] = np.NaN diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py index db38e93c44..59297a132c 100644 --- a/pandas/tests/test_tseries.py +++ b/pandas/tests/test_tseries.py @@ -201,6 +201,11 @@ def _check(arr): _check(np.array([nan, nan, 5., 5., 5., nan, 1, 2, 3, nan])) _check(np.array([4., nan, 5., 5., 5., nan, 1, 2, 4., nan])) +def test_get_reverse_indexer(): + indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype='i4') + result = lib.get_reverse_indexer(indexer, 5) + expected = np.array([4, 2, 3, 6, 7], dtype='i4') + assert(np.array_equal(result, expected)) class TestMoments(unittest.TestCase): pass diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 94a4ff5dbe..65b726a9f5 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -222,11 +222,6 @@ def add_nans(panel): for j, col in enumerate(dm.columns): dm[col][:i + j] = np.NaN -def makeLongPanel(): - wp = makePanel() - add_nans(wp) - - return wp.to_long() # Dependency checks. 
Copied this from Nipy/Nipype (Copyright of # respective developers, license: BSD-3) diff --git a/vb_suite/index_object.py b/vb_suite/index_object.py index 9e481d5a5a..3df763133d 100644 --- a/vb_suite/index_object.py +++ b/vb_suite/index_object.py @@ -3,3 +3,20 @@ SECTION = "Index / MultiIndex objects" + +common_setup = """from pandas_vb_common import * +""" + +#---------------------------------------------------------------------- +# intersection, union + +setup = common_setup + """ +rng = DateRange('1/1/2000', periods=10000, offset=datetools.Minute()) +rng = rng.view(Index) +rng2 = rng[:-1] +""" + +index_datetime_intersection = Benchmark("rng.intersection(rng2)", setup, + name='index_datetime_intersection') +index_datetime_union = Benchmark("rng.union(rng2)", setup, + name='index_datetime_union')