序列的修改、散列和切片
此笔记记录于《流畅的 python》,大部分为其中的摘要,少部分为笔者自己的理解;笔记为 jupyter 转的 markdown,原始版 jupyter 笔记在这个仓库
此笔记记录于《流畅的 python》,大部分为其中的摘要,少部分为笔者自己的理解;笔记为 jupyter 转的 markdown,原始版 jupyter 笔记在这个仓库
不要检查它是不是鸭子:检查它的叫声像不像鸭子、它的走路姿势像不像鸭子,等等。具体检查什么取决于你想使用语言的哪些行为 m
from array import array
import reprlib
import math
class Vector:
typecode = 'd'
def __init__(self, components):
# “受保护的”实例属性,把 Vector 的分量保存在一个数组中,之前 2D 的只有 x 和 y,现在是任意维度的
self._components = array(self.typecode, components)
def __iter__(self):
return iter(self._components)
def __repr__(self):
components = reprlib.repr(self._components)
components = components[components.find('['):-1]
return 'Vector({})'.format(components)
def __str__(self):
return str(tuple(self))
def __bytes__(self):
return (bytes([ord(self.typecode)]) +
bytes(self._components))
def __eq__(self, other):
return tuple(self) == tuple(other)
def __abs__(self):
return math.sqrt(sum(x * x for x in self))
def __bool__(self):
return bool(abs(self))
@classmethod
def frombytes(cls, octets):
typecode = chr(octets[0])
memv = memoryview(octets[1:]).cast(typecode)
return cls(memv)
Vector([3.1, 4.2])
Vector([3.1, 4.2])
Vector((3, 4, 5))
Vector([3.0, 4.0, 5.0])
Vector(range(10))
Vector([0.0, 1.0, 2.0, 3.0, 4.0, ...])
调用repr()
函数的目的是调试,因此绝对不能抛出异常。如果__repr__
方法的实现有问题,那么必须处理,尽量输出有用的内容,让用户能够识别目标对象。
顺便说一下,我们本可以让 Vector 继承 Vector2d,但是我没这么做,原因有二:
- 其一,两个构造方法不兼容,因此不建议继承。这一点可以通过适当处理
__init__
方法的参数解决, - 不过第二个原因更重要:我想把 Vector 类当作单独的示例,以此实现序列协议。接下来,我们先讨论协议这个术语,然后实现序列协议。
协议和鸭子类型
在面向对象编程中,协议是非正式的接口,只在文档中定义,在代码中不定义。例如,Python 的序列协议只需要__len__
和__getitem__
两个方法。任何类(如 Spam),只要使用标准的签名和语义实现了这两个方法,就能用在任何期待序列的地方。Spam 是不是哪个类的子类无关紧要,只要提供了所需的方法即可。
import collections
Card = collections.namedtuple('Card', ['rank', 'suit'])
class FrenchDeck:
ranks = [str(n) for n in range(2, 11)]+list('JQKA')
suits = 'spades diamonds clubs hearts'.split()
def __init__(self):
self._cards = [Card(rank, suit) for suit in self.suits
for rank in self.ranks]
def __len__(self):
return len(self._cards)
def __getitem__(self, position):
return self._cards[position]
FrenchDeck 类能充分利用 Python 的很多功能,因为它实现了序列协议,不过代码中并没有声明这一点。任何有经验的 Python 程序员只要看一眼就知道它是序列,即便它是 object 的子类也无妨。我们说它是序列,因为它的行为像序列,这才是重点。
协议是非正式的,没有强制力,因此如果你知道类的具体使用场景,通常只需要实现一个协议的部分。例如,为了支持迭代,只需实现__getitem__
方法,没必要提供__len__
方法。
Vector 类第 2 版:可切片的序列
class Vector:
# 省略了很多行
# ...
def __len__(self):
return len(self._components)
def __getitem__(self, index):
return self._components[index]
上述添加了__len__
和__getitem__
方法后,就可以支持切片了
切片原理
class MySeq:
def __getitem__(self, index):
return index # __getitem__直接返回传给它的值。
s = MySeq()
s[1]
1
s[1:4]
slice(1, 4, None)
s[1:4:2]
slice(1, 4, 2)
s[1:4:2, 9] # 如果[]中有逗号,那么__getitem__收到的是元组。
(slice(1, 4, 2), 9)
s[1:4:2, 7:9] # 元组中甚至可以有多个切片对象
(slice(1, 4, 2), slice(7, 9, None))
查看 slice 类的属性
slice
slice
dir(slice)
['__class__',
'__delattr__',
'__dir__',
'__doc__',
'__eq__',
'__format__',
'__ge__',
'__getattribute__',
'__gt__',
'__hash__',
'__init__',
'__init_subclass__',
'__le__',
'__lt__',
'__ne__',
'__new__',
'__reduce__',
'__reduce_ex__',
'__repr__',
'__setattr__',
'__sizeof__',
'__str__',
'__subclasshook__',
'indices',
'start',
'step',
'stop']
help(slice.indices) # 给定一个长度,处理边界情况
Help on method_descriptor:
indices(...)
S.indices(len) -> (start, stop, stride)
Assuming a sequence of length len, calculate the start and stop
indices, and the stride length of the extended slice described by
S. Out of bounds indices are clipped in a manner consistent with the
handling of normal slices.
给定长度为 len 的序列,计算 S 表示的扩展切片的起始(start)和结尾(stop)索引,以及步幅(stride)。超出边界的索引会被截掉,这与常规切片的处理方式一样。
换句话说,indices 方法开放了内置序列实现的棘手逻辑,用于优雅地处理缺失索引和负数索引,以及长度超过目标序列的切片。这个方法会“整顿”元组,把 start、stop 和 stride 都变成非负数,而且都落在指定长度序列的边界内。
slice(None, 10, 2).indices(5) # (0, 5, 2)
(0, 5, 2)
slice(-3, None, None).indices(5) # (2, 5, 1)
(2, 5, 1)
能处理切片的__getitem__
方法
import numbers
def __len__(self):
return len(self._components)
def __getitem__(self, index):
cls = type(self)
if isinstance(index, slice):
return cls(self._components[index])
elif isinstance(index, numbers.Integral):
return self._components[index]
else:
msg = '{cls.__name__} indices must be integers'
raise TypeError(msg.format(cls=cls))
Vector 类第 3 版:动态存取属性
通过单个字母访问前几个分量的话会比较方便。比如,用 x、y 和 z 代替v[0]
、v[1]
和v[2]
。
shortcut_names = 'xyzt'
def __getattr__(self, name):
cls = type(self)
if len(name) == 1: # 如果属性名只有一个字母,可能是 shortcut_names 中的一个。
pos = cls.shortcut_names.find(name)
if 0 <= pos < len(self._components):
return self._components[pos]
msg = '{.__name__!r} object has no attribute {!r}'
raise AttributeError(msg.format(cls, name))
def __setattr__(self, name, value):
cls = type(self)
if len(name) == 1:
if name in cls.shortcut_names:
error = 'readonly attribute {attr_name!r}'
elif name.islower():
error = "can't set attributes 'a' to 'z' in {cls_name!r}"
else:
error = ''
if error:
msg = error.format(cls_name=cls.__name__, attr_name=name)
raise AttributeError(msg)
super().__setattr__(name, value)
Vector 第 4 版:散列和快速等值测试
from array import array
import reprlib
import math
import functools # ➊
import operator # ➋
class Vector:
typecode = 'd'
# 排版需要,省略了很多行...
def __eq__(self, other): # ➌
return tuple(self) == tuple(other)
def __hash__(self):
hashes = (hash(x) for x in self._components) # ➍
return functools.reduce(operator.xor, hashes, 0) # ➎
# 省略了很多行...
在 Python 2 中使用 map 函数效率低些,因为 map 函数要使用结果构建一个列表。但是在 Python 3 中,map 函数是惰性的,它会创建一个生成器,按需产出结果,因此能节省内存
# 效率更高的__eq__方法
def __eq__(self, other):
if len(self) != len(other): # ➊
return False
for a, b in zip(self, other): # ➋
if a != b: # ➌
return False
return True # ➍
# 单行写法
def __eq__(self, other):
return len(self) == len(other) and all(a == b for a, b in zip(self, other))
Vector 类第 5 版:格式化
# 所有的完整代码
"""
A multidimensional ``Vector`` class, take 5
A ``Vector`` is built from an iterable of numbers::
>>> Vector([3.1, 4.2])
Vector([3.1, 4.2])
>>> Vector((3, 4, 5))
Vector([3.0, 4.0, 5.0])
>>> Vector(range(10))
Vector([0.0, 1.0, 2.0, 3.0, 4.0, ...])
Tests with two dimensions (same results as ``vector2d_v1.py``)::
>>> v1 = Vector([3, 4])
>>> x, y = v1
>>> x, y
(3.0, 4.0)
>>> v1
Vector([3.0, 4.0])
>>> v1_clone = eval(repr(v1))
>>> v1 == v1_clone
True
>>> print(v1)
(3.0, 4.0)
>>> octets = bytes(v1)
>>> octets
b'd\\x00\\x00\\x00\\x00\\x00\\x00\\x08@\\x00\\x00\\x00\\x00\\x00\\x00\\x10@'
>>> abs(v1)
5.0
>>> bool(v1), bool(Vector([0, 0]))
(True, False)
Test of ``.frombytes()`` class method:
>>> v1_clone = Vector.frombytes(bytes(v1))
>>> v1_clone
Vector([3.0, 4.0])
>>> v1 == v1_clone
True
Tests with three dimensions::
>>> v1 = Vector([3, 4, 5])
>>> x, y, z = v1
>>> x, y, z
(3.0, 4.0, 5.0)
>>> v1
Vector([3.0, 4.0, 5.0])
>>> v1_clone = eval(repr(v1))
>>> v1 == v1_clone
True
>>> print(v1)
(3.0, 4.0, 5.0)
>>> abs(v1) # doctest:+ELLIPSIS
7.071067811...
>>> bool(v1), bool(Vector([0, 0, 0]))
(True, False)
Tests with many dimensions::
>>> v7 = Vector(range(7))
>>> v7
Vector([0.0, 1.0, 2.0, 3.0, 4.0, ...])
>>> abs(v7) # doctest:+ELLIPSIS
9.53939201...
Test of ``.__bytes__`` and ``.frombytes()`` methods::
>>> v1 = Vector([3, 4, 5])
>>> v1_clone = Vector.frombytes(bytes(v1))
>>> v1_clone
Vector([3.0, 4.0, 5.0])
>>> v1 == v1_clone
True
Tests of sequence behavior::
>>> v1 = Vector([3, 4, 5])
>>> len(v1)
3
>>> v1[0], v1[len(v1)-1], v1[-1]
(3.0, 5.0, 5.0)
Test of slicing::
>>> v7 = Vector(range(7))
>>> v7[-1]
6.0
>>> v7[1:4]
Vector([1.0, 2.0, 3.0])
>>> v7[-1:]
Vector([6.0])
>>> v7[1,2]
Traceback (most recent call last):
...
TypeError: Vector indices must be integers
Tests of dynamic attribute access::
>>> v7 = Vector(range(10))
>>> v7.x
0.0
>>> v7.y, v7.z, v7.t
(1.0, 2.0, 3.0)
Dynamic attribute lookup failures::
>>> v7.k
Traceback (most recent call last):
...
AttributeError: 'Vector' object has no attribute 'k'
>>> v3 = Vector(range(3))
>>> v3.t
Traceback (most recent call last):
...
AttributeError: 'Vector' object has no attribute 't'
>>> v3.spam
Traceback (most recent call last):
...
AttributeError: 'Vector' object has no attribute 'spam'
Tests of hashing::
>>> v1 = Vector([3, 4])
>>> v2 = Vector([3.1, 4.2])
>>> v3 = Vector([3, 4, 5])
>>> v6 = Vector(range(6))
>>> hash(v1), hash(v3), hash(v6)
(7, 2, 1)
Most hash values of non-integers vary from a 32-bit to 64-bit CPython build::
>>> import sys
>>> hash(v2) == (384307168202284039 if sys.maxsize > 2**32 else 357915986)
True
Tests of ``format()`` with Cartesian coordinates in 2D::
>>> v1 = Vector([3, 4])
>>> format(v1)
'(3.0, 4.0)'
>>> format(v1, '.2f')
'(3.00, 4.00)'
>>> format(v1, '.3e')
'(3.000e+00, 4.000e+00)'
Tests of ``format()`` with Cartesian coordinates in 3D and 7D::
>>> v3 = Vector([3, 4, 5])
>>> format(v3)
'(3.0, 4.0, 5.0)'
>>> format(Vector(range(7)))
'(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0)'
Tests of ``format()`` with spherical coordinates in 2D, 3D and 4D::
>>> format(Vector([1, 1]), 'h') # doctest:+ELLIPSIS
'<1.414213..., 0.785398...>'
>>> format(Vector([1, 1]), '.3eh')
'<1.414e+00, 7.854e-01>'
>>> format(Vector([1, 1]), '0.5fh')
'<1.41421, 0.78540>'
>>> format(Vector([1, 1, 1]), 'h') # doctest:+ELLIPSIS
'<1.73205..., 0.95531..., 0.78539...>'
>>> format(Vector([2, 2, 2]), '.3eh')
'<3.464e+00, 9.553e-01, 7.854e-01>'
>>> format(Vector([0, 0, 0]), '0.5fh')
'<0.00000, 0.00000, 0.00000>'
>>> format(Vector([-1,-1,-1,-1]), 'h') # doctest:+ELLIPSIS
'<2.0, 2.09439..., 2.18627..., 3.92699...>'
>>> format(Vector([2, 2, 2, 2]), '.3eh')
'<4.000e+00, 1.047e+00, 9.553e-01, 7.854e-01>'
>>> format(Vector([0, 1, 0, 0]), '0.5fh')
'<1.00000, 1.57080, 0.00000, 0.00000>'
"""
from array import array
import reprlib
import math
import numbers
import functools
import operator
import itertools
class Vector:
typecode = 'd'
def __init__(self, components):
self._components = array(self.typecode, components)
def __iter__(self):
return iter(self._components)
def __repr__(self):
components = reprlib.repr(self._components)
components = components[components.find('['):-1]
return 'Vector({})'.format(components)
def __str__(self):
return str(tuple(self))
def __bytes__(self):
return (bytes([ord(self.typecode)]) +
bytes(self._components))
def __eq__(self, other):
return (len(self) == len(other) and
all(a == b for a, b in zip(self, other)))
def __hash__(self):
hashes = (hash(x) for x in self)
return functools.reduce(operator.xor, hashes, 0)
def __abs__(self):
return math.sqrt(sum(x * x for x in self))
def __bool__(self):
return bool(abs(self))
def __len__(self):
return len(self._components)
def __getitem__(self, index):
cls = type(self)
if isinstance(index, slice):
return cls(self._components[index])
elif isinstance(index, numbers.Integral):
return self._components[index]
else:
msg = '{.__name__} indices must be integers'
raise TypeError(msg.format(cls))
shortcut_names = 'xyzt'
def __getattr__(self, name):
cls = type(self)
if len(name) == 1:
pos = cls.shortcut_names.find(name)
if 0 <= pos < len(self._components):
return self._components[pos]
msg = '{.__name__!r} object has no attribute {!r}'
raise AttributeError(msg.format(cls, name))
def angle(self, n):
r = math.sqrt(sum(x * x for x in self[n:]))
a = math.atan2(r, self[n-1])
if (n == len(self)-1) and (self[-1] < 0):
return math.pi * 2-a
else:
return a
def angles(self):
return (self.angle(n) for n in range(1, len(self)))
def __format__(self, fmt_spec=''):
if fmt_spec.endswith('h'): # 超球面坐标
fmt_spec = fmt_spec[:-1]
coords = itertools.chain([abs(self)],
self.angles())
outer_fmt = '<{}>'
else:
coords = self
outer_fmt = '({})'
components = (format(c, fmt_spec) for c in coords)
return outer_fmt.format(', '.join(components))
@classmethod
def frombytes(cls, octets):
typecode = chr(octets[0])
memv = memoryview(octets[1:]).cast(typecode)
return cls(memv)