序列的修改、散列和切片

此笔记整理自《流畅的 Python》，大部分为书中内容的摘要，少部分为笔者自己的理解；笔记由 Jupyter 笔记本转换为 Markdown，原始版 Jupyter 笔记在这个仓库中。

不要检查它是不是鸭子：检查它的叫声像不像鸭子、它的走路姿势像不像鸭子，等等。具体检查什么取决于你想使用语言的哪些行为。——Alex Martelli

python

from array import array
import reprlib
import math


class Vector:
    """An n-dimensional vector whose components are stored in an ``array``.

    The vector is built from any iterable of numbers; components are
    converted to the C type selected by ``typecode``.
    """

    typecode = 'd'  # array typecode used for storage (C double)

    def __init__(self, components):
        # "Protected" instance attribute holding the components in an array.
        # The earlier 2D version only had x and y; this one supports any
        # number of dimensions.
        self._components = array(self.typecode, components)

    def __iter__(self):
        """Iterate over the components in order."""
        return iter(self._components)

    def __repr__(self):
        """Return ``Vector([...])``, abbreviating long vectors with ``...``."""
        components = reprlib.repr(self._components)
        start = components.find('[')
        if start < 0:
            # reprlib renders an empty array as "array('d')" with no
            # brackets; emit an explicit empty list so the result can
            # still be evaluated back into a Vector.
            components = '[]'
        else:
            # Strip the leading "array('d', " and the trailing ")".
            components = components[start:-1]
        return 'Vector({})'.format(components)

    def __str__(self):
        """Return the components formatted as a tuple."""
        return str(tuple(self))

    def __bytes__(self):
        """Return the typecode byte followed by the raw array bytes."""
        return (bytes([ord(self.typecode)]) +
                bytes(self._components))

    def __eq__(self, other):
        """Compare component-wise with any iterable.

        Returns ``NotImplemented`` when ``other`` is not iterable, so that
        ``vector == 5`` evaluates to ``False`` instead of raising TypeError.
        """
        try:
            other_items = tuple(other)
        except TypeError:
            return NotImplemented
        return tuple(self) == other_items

    def __abs__(self):
        """Return the Euclidean norm of the vector."""
        return math.sqrt(sum(x * x for x in self))

    def __bool__(self):
        """A vector is falsy only when its magnitude is zero."""
        return bool(abs(self))

    @classmethod
    def frombytes(cls, octets):
        """Build a vector from the byte sequence produced by ``__bytes__``."""
        typecode = chr(octets[0])  # first byte is the array typecode
        memv = memoryview(octets[1:]).cast(typecode)
        return cls(memv)

python

Vector([3.1, 4.2])

Vector([3.1, 4.2])

python

Vector((3, 4, 5))

Vector([3.0, 4.0, 5.0])

python

Vector(range(10))

Vector([0.0, 1.0, 2.0, 3.0, 4.0, ...])

调用repr()函数的目的是调试，因此绝对不能抛出异常。如果__repr__方法的实现有问题，那么必须处理，尽量输出有用的内容，让用户能够识别目标对象。

顺便说一下，我们本可以让 Vector 继承 Vector2d，但是我没这么做，原因有二：

其一，两个构造方法不兼容，因此不建议继承。这一点可以通过适当处理__init__方法的参数解决，
不过第二个原因更重要：我想把 Vector 类当作单独的示例，以此实现序列协议。接下来，我们先讨论协议这个术语，然后实现序列协议。

协议和鸭子类型

在面向对象编程中，协议是非正式的接口，只在文档中定义，在代码中不定义。例如，Python 的序列协议只需要__len__和__getitem__两个方法。任何类（如 Spam），只要使用标准的签名和语义实现了这两个方法，就能用在任何期待序列的地方。Spam 是不是哪个类的子类无关紧要，只要提供了所需的方法即可。

python

import collections
Card = collections.namedtuple('Card', ['rank', 'suit'])


class FrenchDeck:
    """A 52-card deck that behaves like a sequence.

    Only ``__len__`` and ``__getitem__`` are implemented, which is what
    the sequence protocol requires.
    """

    ranks = list(map(str, range(2, 11))) + list('JQKA')
    suits = 'spades diamonds clubs hearts'.split()

    def __init__(self):
        # Build the deck suit by suit, with ranks ascending inside a suit.
        deck = []
        for suit in self.suits:
            for rank in self.ranks:
                deck.append(Card(rank, suit))
        self._cards = deck

    def __len__(self):
        """Return the number of cards in the deck."""
        cards = self._cards
        return len(cards)

    def __getitem__(self, position):
        """Return the card (or list of cards, for a slice) at ``position``."""
        cards = self._cards
        return cards[position]

FrenchDeck 类能充分利用 Python 的很多功能，因为它实现了序列协议，不过代码中并没有声明这一点。任何有经验的 Python 程序员只要看一眼就知道它是序列，即便它是 object 的子类也无妨。我们说它是序列，因为它的行为像序列，这才是重点。

协议是非正式的，没有强制力，因此如果你知道类的具体使用场景，通常只需要实现一个协议的部分。例如，为了支持迭代，只需实现__getitem__方法，没必要提供__len__方法。

Vector 类第 2 版：可切片的序列

python

class Vector:
    # Many lines omitted
    # ...
    def __len__(self):
        """Return the number of components."""
        components = self._components
        return len(components)

    def __getitem__(self, index):
        """Delegate indexing (integer or slice) to the underlying storage."""
        components = self._components
        return components[index]

添加了__len__和__getitem__方法后，Vector 就可以支持索引和切片了。不过此时切片得到的是普通的 array 数组，而不是新的 Vector 实例；后面“能处理切片的__getitem__方法”一节会解决这个问题。

切片原理

python

class MySeq:
    """Minimal class for inspecting what the ``[]`` operator passes in."""

    def __getitem__(self, index):
        return index  # __getitem__ simply returns whatever it was given.

python

s = MySeq()
s[1]

python

s[1:4]

slice(1, 4, None)

python

s[1:4:2]

slice(1, 4, 2)

python

s[1:4:2, 9] # 如果[]中有逗号，那么__getitem__收到的是元组。

(slice(1, 4, 2), 9)

python

s[1:4:2, 7:9] # 元组中甚至可以有多个切片对象

(slice(1, 4, 2), slice(7, 9, None))

查看 slice 类的属性

python

slice

slice

python

dir(slice)

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'indices',
 'start',
 'step',
 'stop']

python

help(slice.indices) # 给定一个长度，处理边界情况

Help on method_descriptor:

indices(...)
    S.indices(len) -> (start, stop, stride)
    
    Assuming a sequence of length len, calculate the start and stop
    indices, and the stride length of the extended slice described by
    S. Out of bounds indices are clipped in a manner consistent with the
    handling of normal slices.

给定长度为 len 的序列，计算 S 表示的扩展切片的起始（start）和结尾（stop）索引，以及步幅（stride）。超出边界的索引会被截掉，这与常规切片的处理方式一样。

换句话说，indices 方法开放了内置序列实现的棘手逻辑，用于优雅地处理缺失索引和负数索引，以及长度超过目标序列的切片。这个方法会“整顿”元组，把缺失的或负数的 start、stop 换算成落在指定长度序列边界内的具体整数，并给出明确的 stride。注意：步幅为负时 stride 仍是负数，stop 也可能是 -1，例如 slice(None, None, -1).indices(5) 得到 (4, -1, -1)。

python

slice(None, 10, 2).indices(5) # (0, 5, 2)

(0, 5, 2)

python

slice(-3, None, None).indices(5) # (2, 5, 1)

(2, 5, 1)

能处理切片的__getitem__方法

python

import numbers


def __len__(self):
    """Return how many components the vector holds."""
    components = self._components
    return len(components)


def __getitem__(self, index):
    """Return one component, or a new instance of the same class for a slice.

    Raises:
        TypeError: if ``index`` is neither a slice nor an integer.
    """
    if isinstance(index, slice):
        # Wrap the sliced storage in the caller's own class.
        return type(self)(self._components[index])
    if isinstance(index, numbers.Integral):
        return self._components[index]
    cls = type(self)
    msg = '{cls.__name__} indices must be integers'
    raise TypeError(msg.format(cls=cls))

Vector 类第 3 版：动态存取属性

通过单个字母访问前几个分量的话会比较方便。比如，用 x、y 和 z 代替v[0]、v[1]和v[2]。

python

shortcut_names = 'xyzt'


def __getattr__(self, name):
    """Resolve single-letter shortcut names to leading components.

    Called only when normal attribute lookup fails.

    Raises:
        AttributeError: if ``name`` is not a shortcut letter, or the vector
            has no component at the corresponding position.
    """
    owner = type(self)
    # Only one-letter names can be shortcuts; anything else is "not found".
    position = owner.shortcut_names.find(name) if len(name) == 1 else -1
    if position >= 0 and position < len(self._components):
        return self._components[position]
    msg = '{.__name__!r} object has no attribute {!r}'
    raise AttributeError(msg.format(owner, name))


def __setattr__(self, name, value):
    """Reject assignment to single-letter lowercase attribute names.

    Names listed in ``shortcut_names`` are reported as read-only; any other
    lowercase letter is rejected with its own message. Every other name is
    stored through the default mechanism.

    Raises:
        AttributeError: for the rejected single-letter names.
    """
    owner = type(self)
    error = ''
    if len(name) == 1:
        if name in owner.shortcut_names:
            error = 'readonly attribute {attr_name!r}'
        elif name.islower():
            error = "can't set attributes 'a' to 'z' in {cls_name!r}"
    if error:
        raise AttributeError(
            error.format(cls_name=owner.__name__, attr_name=name))
    # No objection: fall back to the standard behaviour.
    super().__setattr__(name, value)

Vector 第 4 版：散列和快速等值测试

python

from array import array
import reprlib
import math
import functools  # ➊
import operator  # ➋


class Vector:
    typecode = 'd'
    # Many lines omitted for layout reasons...

    def __eq__(self, other):  # ➌
        """Compare component-wise with any iterable.

        Returns ``NotImplemented`` when ``other`` is not iterable, so that
        ``vector == 5`` evaluates to ``False`` instead of raising TypeError.
        """
        try:
            other_items = tuple(other)
        except TypeError:
            return NotImplemented
        return tuple(self) == other_items

    def __hash__(self):
        """Combine the hashes of all components with XOR."""
        # Lazily compute each component's hash.
        hashes = (hash(x) for x in self._components)  # ➍
        # The initial value 0 is the identity for XOR and keeps reduce
        # from failing on an empty vector.
        return functools.reduce(operator.xor, hashes, 0)  # ➎
    # Many lines omitted...

计算各分量散列值的那一步（➍）也可以用 map 函数来写：hashes = map(hash, self._components)。在 Python 2 中使用 map 函数效率低些，因为 map 函数要使用结果构建一个列表。但是在 Python 3 中，map 函数是惰性的，它会创建一个生成器，按需产出结果，因此能节省内存。

python

# 效率更高的__eq__方法
def __eq__(self, other):
    """Compare two sized iterables item by item, stopping at the first mismatch.

    Returns ``NotImplemented`` when ``other`` has no length, so that Python
    can try the reflected comparison instead of raising TypeError.
    """
    try:
        other_len = len(other)
    except TypeError:
        return NotImplemented
    if len(self) != other_len:  # ➊
        return False
    for a, b in zip(self, other):  # ➋
        if a != b:  # ➌
            return False
    return True  # ➍

python

# 单行写法
def __eq__(self, other):
    """Return True when both operands have equal length and equal items.

    Returns ``NotImplemented`` when ``other`` cannot be measured or iterated,
    so that Python can try the reflected comparison instead of raising.
    """
    try:
        return len(self) == len(other) and all(a == b for a, b in zip(self, other))
    except TypeError:
        return NotImplemented

Vector 类第 5 版：格式化

python

# 所有的完整代码
"""
A multidimensional ``Vector`` class, take 5
A ``Vector`` is built from an iterable of numbers::
    >>> Vector([3.1, 4.2])
    Vector([3.1, 4.2])
    >>> Vector((3, 4, 5))
    Vector([3.0, 4.0, 5.0])
    >>> Vector(range(10))
    Vector([0.0, 1.0, 2.0, 3.0, 4.0, ...])
Tests with two dimensions (same results as ``vector2d_v1.py``)::
    >>> v1 = Vector([3, 4])
    >>> x, y = v1
    >>> x, y
    (3.0, 4.0)
    >>> v1
    Vector([3.0, 4.0])
    >>> v1_clone = eval(repr(v1))
    >>> v1 == v1_clone
    True
    >>> print(v1)
    (3.0, 4.0)
    >>> octets = bytes(v1)
    >>> octets
    b'd\\x00\\x00\\x00\\x00\\x00\\x00\\x08@\\x00\\x00\\x00\\x00\\x00\\x00\\x10@'
    >>> abs(v1)
    5.0
    >>> bool(v1), bool(Vector([0, 0]))
    (True, False)
Test of ``.frombytes()`` class method:
    >>> v1_clone = Vector.frombytes(bytes(v1))
    >>> v1_clone
    Vector([3.0, 4.0])
    >>> v1 == v1_clone
    True
Tests with three dimensions::
    >>> v1 = Vector([3, 4, 5])
    >>> x, y, z = v1
    >>> x, y, z
    (3.0, 4.0, 5.0)
    >>> v1
    Vector([3.0, 4.0, 5.0])
    >>> v1_clone = eval(repr(v1))
    >>> v1 == v1_clone
    True
    >>> print(v1)
    (3.0, 4.0, 5.0)
    >>> abs(v1)  # doctest:+ELLIPSIS
    7.071067811...
    >>> bool(v1), bool(Vector([0, 0, 0]))
    (True, False)
Tests with many dimensions::
    >>> v7 = Vector(range(7))
    >>> v7
    Vector([0.0, 1.0, 2.0, 3.0, 4.0, ...])
    >>> abs(v7)  # doctest:+ELLIPSIS
    9.53939201...
Test of ``.__bytes__`` and ``.frombytes()`` methods::
    >>> v1 = Vector([3, 4, 5])
    >>> v1_clone = Vector.frombytes(bytes(v1))
    >>> v1_clone
    Vector([3.0, 4.0, 5.0])
    >>> v1 == v1_clone
    True
Tests of sequence behavior::
    >>> v1 = Vector([3, 4, 5])
    >>> len(v1)
    3
    >>> v1[0], v1[len(v1)-1], v1[-1]
    (3.0, 5.0, 5.0)
Test of slicing::
    >>> v7 = Vector(range(7))
    >>> v7[-1]
    6.0
    >>> v7[1:4]
    Vector([1.0, 2.0, 3.0])
    >>> v7[-1:]
    Vector([6.0])
    >>> v7[1,2]
    Traceback (most recent call last):
      ...
    TypeError: Vector indices must be integers
Tests of dynamic attribute access::
    >>> v7 = Vector(range(10))
    >>> v7.x
    0.0
    >>> v7.y, v7.z, v7.t
    (1.0, 2.0, 3.0)
Dynamic attribute lookup failures::
    >>> v7.k
    Traceback (most recent call last):
      ...
    AttributeError: 'Vector' object has no attribute 'k'
    >>> v3 = Vector(range(3))
    >>> v3.t
    Traceback (most recent call last):
      ...
    AttributeError: 'Vector' object has no attribute 't'
    >>> v3.spam
    Traceback (most recent call last):
      ...
    AttributeError: 'Vector' object has no attribute 'spam'
Tests of hashing::
    >>> v1 = Vector([3, 4])
    >>> v2 = Vector([3.1, 4.2])
    >>> v3 = Vector([3, 4, 5])
    >>> v6 = Vector(range(6))
    >>> hash(v1), hash(v3), hash(v6)
    (7, 2, 1)
Most hash values of non-integers vary from a 32-bit to 64-bit CPython build::
    >>> import sys
    >>> hash(v2) == (384307168202284039 if sys.maxsize > 2**32 else 357915986)
    True
Tests of ``format()`` with Cartesian coordinates in 2D::
    >>> v1 = Vector([3, 4])
    >>> format(v1)
    '(3.0, 4.0)'
    >>> format(v1, '.2f')
    '(3.00, 4.00)'
    >>> format(v1, '.3e')
    '(3.000e+00, 4.000e+00)'
Tests of ``format()`` with Cartesian coordinates in 3D and 7D::
    >>> v3 = Vector([3, 4, 5])
    >>> format(v3)
    '(3.0, 4.0, 5.0)'
    >>> format(Vector(range(7)))
    '(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0)'
Tests of ``format()`` with spherical coordinates in 2D, 3D and 4D::
    >>> format(Vector([1, 1]), 'h')  # doctest:+ELLIPSIS
    '<1.414213..., 0.785398...>'
    >>> format(Vector([1, 1]), '.3eh')
    '<1.414e+00, 7.854e-01>'
    >>> format(Vector([1, 1]), '0.5fh')
    '<1.41421, 0.78540>'
    >>> format(Vector([1, 1, 1]), 'h')  # doctest:+ELLIPSIS
    '<1.73205..., 0.95531..., 0.78539...>'
    >>> format(Vector([2, 2, 2]), '.3eh')
    '<3.464e+00, 9.553e-01, 7.854e-01>'
    >>> format(Vector([0, 0, 0]), '0.5fh')
    '<0.00000, 0.00000, 0.00000>'
    >>> format(Vector([-1,-1,-1,-1]), 'h')  # doctest:+ELLIPSIS
    '<2.0, 2.09439..., 2.18627..., 3.92699...>'
    >>> format(Vector([2, 2, 2, 2]), '.3eh')
    '<4.000e+00, 1.047e+00, 9.553e-01, 7.854e-01>'
    >>> format(Vector([0, 1, 0, 0]), '0.5fh')
    '<1.00000, 1.57080, 0.00000, 0.00000>'
"""
from array import array
import reprlib
import math
import numbers
import functools
import operator
import itertools


class Vector:
    """A multidimensional vector that behaves like an immutable flat sequence.

    Components are stored in an ``array`` of ``typecode``. The class
    supports iteration, ``len()``, indexing and slicing (slices produce new
    ``Vector`` instances), hashing, read-only ``x``/``y``/``z``/``t``
    shortcuts for the first four components, and ``format()`` in Cartesian
    or (with an ``'h'`` suffix) hyperspherical coordinates.
    """

    typecode = 'd'  # array typecode used for storage (C double)
    shortcut_names = 'xyzt'  # letters mapped to components 0..3

    def __init__(self, components):
        self._components = array(self.typecode, components)

    def __iter__(self):
        """Iterate over the components in order."""
        return iter(self._components)

    def __repr__(self):
        """Return ``Vector([...])``, abbreviating long vectors with ``...``."""
        components = reprlib.repr(self._components)
        start = components.find('[')
        if start < 0:
            # reprlib renders an empty array as "array('d')" with no
            # brackets; emit an explicit empty list so the result can
            # still be evaluated back into a Vector.
            components = '[]'
        else:
            # Strip the leading "array('d', " and the trailing ")".
            components = components[start:-1]
        return 'Vector({})'.format(components)

    def __str__(self):
        """Return the components formatted as a tuple."""
        return str(tuple(self))

    def __bytes__(self):
        """Return the typecode byte followed by the raw array bytes."""
        return (bytes([ord(self.typecode)]) +
                bytes(self._components))

    def __eq__(self, other):
        """Compare length and then components, stopping at the first mismatch.

        Returns ``NotImplemented`` when ``other`` has no length, so that
        ``vector == 5`` evaluates to ``False`` instead of raising TypeError.
        """
        try:
            other_len = len(other)
        except TypeError:
            return NotImplemented
        return (len(self) == other_len and
                all(a == b for a, b in zip(self, other)))

    def __hash__(self):
        """Combine the hashes of all components with XOR."""
        hashes = (hash(x) for x in self)
        return functools.reduce(operator.xor, hashes, 0)

    def __abs__(self):
        """Return the Euclidean norm of the vector."""
        return math.sqrt(sum(x * x for x in self))

    def __bool__(self):
        """A vector is falsy only when its magnitude is zero."""
        return bool(abs(self))

    def __len__(self):
        """Return the number of components."""
        return len(self._components)

    def __getitem__(self, index):
        """Return one component, or a new ``Vector`` for a slice.

        Raises:
            TypeError: if ``index`` is neither a slice nor an integer.
        """
        cls = type(self)
        if isinstance(index, slice):
            return cls(self._components[index])
        elif isinstance(index, numbers.Integral):
            return self._components[index]
        else:
            msg = '{.__name__} indices must be integers'
            raise TypeError(msg.format(cls))

    def __getattr__(self, name):
        """Resolve the shortcut letters in ``shortcut_names`` to components.

        Called only when normal attribute lookup fails.

        Raises:
            AttributeError: if ``name`` is not a shortcut letter, or the
                vector has no component at the corresponding position.
        """
        cls = type(self)
        if len(name) == 1:
            pos = cls.shortcut_names.find(name)
            if 0 <= pos < len(self._components):
                return self._components[pos]
        msg = '{.__name__!r} object has no attribute {!r}'
        raise AttributeError(msg.format(cls, name))

    def __setattr__(self, name, value):
        """Reject assignment to single-letter lowercase attribute names.

        Without this guard, ``v.x = 10`` would create an instance attribute
        that shadows ``__getattr__``, so ``v.x`` and ``v[0]`` would disagree.

        Raises:
            AttributeError: for shortcut names and other lowercase letters.
        """
        cls = type(self)
        if len(name) == 1:
            if name in cls.shortcut_names:
                error = 'readonly attribute {attr_name!r}'
            elif name.islower():
                error = "can't set attributes 'a' to 'z' in {cls_name!r}"
            else:
                error = ''
            if error:
                msg = error.format(cls_name=cls.__name__, attr_name=name)
                raise AttributeError(msg)
        super().__setattr__(name, value)

    def angle(self, n):
        """Return the n-th angular coordinate (1-based) in radians."""
        r = math.sqrt(sum(x * x for x in self[n:]))
        a = math.atan2(r, self[n-1])
        # The last angle covers the full circle; reflect it when the final
        # component is negative.
        if (n == len(self)-1) and (self[-1] < 0):
            return math.pi * 2-a
        else:
            return a

    def angles(self):
        """Return a generator over all angular coordinates."""
        return (self.angle(n) for n in range(1, len(self)))

    def __format__(self, fmt_spec=''):
        """Format each coordinate with ``fmt_spec``.

        A trailing ``'h'`` selects hyperspherical coordinates, rendered as
        ``<r, angle1, ...>``; otherwise Cartesian components are rendered
        as ``(x, y, ...)``.
        """
        if fmt_spec.endswith('h'):  # hyperspherical coordinates
            fmt_spec = fmt_spec[:-1]
            coords = itertools.chain([abs(self)],
                                     self.angles())
            outer_fmt = '<{}>'
        else:
            coords = self
            outer_fmt = '({})'
        components = (format(c, fmt_spec) for c in coords)
        return outer_fmt.format(', '.join(components))

    @classmethod
    def frombytes(cls, octets):
        """Build a vector from the byte sequence produced by ``__bytes__``."""
        typecode = chr(octets[0])  # first byte is the array typecode
        memv = memoryview(octets[1:]).cast(typecode)
        return cls(memv)

序列的修改、散列和切片

协议和鸭子类型

Vector 类第 2 版：可切片的序列

Vector 类第 3 版：动态存取属性

Vector 第 4 版：散列和快速等值测试

Vector 类第 5 版：格式化

序列的修改、散列和切片

协议和鸭子类型

Vector 类第 2 版：可切片的序列

Vector 类第 3 版：动态存取属性

Vector 第 4 版：散列和快速等值测试

Vector 类第 5 版：格式化