Failing command

```
pip install pylint
```

Error message

```
......
  File "/opt/py3.8/ve1/lib/python3.8/site-packages/pip/_internal/index/collector.py", line 351, in parse_links
    parser.feed(page.content.decode(encoding))
LookupError: unknown encoding: utf-8,gbk
```
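The failure is easy to reproduce outside of pip: Python's codec registry has no codec named `utf-8,gbk`, so decoding with that name raises the same `LookupError`:

```python
>>> b"<html></html>".decode("utf-8,gbk")
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
LookupError: unknown encoding: utf-8,gbk
```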

Solution

Option 1: switch to a different pip index (mirror).
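For example, PyPI's official index sends a valid `charset`, so pointing this one install at it sidesteps the broken mirror (any well-behaved mirror works equally well):

```
pip install pylint -i https://pypi.org/simple/
```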

Option 2: patch the pip source code.

Modify the `_make_html_page` function in `site-packages/pip/_internal/index/collector.py` as follows:

```python
def _make_html_page(response: Response, cache_link_parsing: bool = True) -> HTMLPage:
    # encoding = _get_encoding_from_headers(response.headers)  # comment out this encoding-detection line
    encoding = 'utf-8'  # add this line
    return HTMLPage(
        response.content,
        encoding=encoding,
        url=response.url,
        cache_link_parsing=cache_link_parsing,
    )
```

Investigation

1. `site-packages/pip/_internal/index/collector.py`, line 351
```python
@with_cached_html_pages
def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Link]:
    """
    Parse an HTML document, and yield its anchor elements as Link objects.
    """

    if use_deprecated_html5lib:
        yield from _parse_links_html5lib(page)
        return

    parser = HTMLLinkParser(page.url)
    encoding = page.encoding or "utf-8"
    parser.feed(page.content.decode(encoding))  # line 351

    url = page.url
    base_url = parser.base_url or url
    for anchor in parser.anchors:
        link = _create_link_from_element(
            anchor,
            page_url=url,
            base_url=base_url,
        )
        if link is None:
            continue
        yield link
```
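Note that the fallback in `encoding = page.encoding or "utf-8"` only fires when `page.encoding` is `None` or empty. A non-empty but invalid name such as `"utf-8,gbk"` is truthy, so it reaches `decode()` unchanged:

```python
# The fallback never fires for a truthy-but-invalid encoding name:
encoding = "utf-8,gbk" or "utf-8"
print(encoding)  # utf-8,gbk -- decode() then fails to look up this codec
```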
2. `HTMLPage.encoding`
```python
class HTMLPage:
    """Represents one page, along with its URL"""

    def __init__(
        self,
        content: bytes,
        encoding: Optional[str],
        url: str,
        cache_link_parsing: bool = True,
    ) -> None:
        """
        :param encoding: the encoding to decode the given content.
        :param url: the URL from which the HTML was downloaded.
        :param cache_link_parsing: whether links parsed from this page's url
                                   should be cached. PyPI index urls should
                                   have this set to False, for example.
        """
        self.content = content
        self.encoding = encoding
        self.url = url
        self.cache_link_parsing = cache_link_parsing

    def __str__(self) -> str:
        return redact_auth_from_url(self.url)
```
3. `site-packages/pip/_internal/index/collector.py`, line 309
```python
def with_cached_html_pages(fn: ParseLinks) -> ParseLinks:
    """
    Given a function that parses an Iterable[Link] from an HTMLPage, cache the
    function's result (keyed by CacheablePageContent), unless the HTMLPage
    `page` has `page.cache_link_parsing == False`.
    """

    @functools.lru_cache(maxsize=None)
    def wrapper(
        cacheable_page: CacheablePageContent, use_deprecated_html5lib: bool
    ) -> List[Link]:
        return list(fn(cacheable_page.page, use_deprecated_html5lib))

    @functools.wraps(fn)
    def wrapper_wrapper(page: "HTMLPage", use_deprecated_html5lib: bool) -> List[Link]:
        if page.cache_link_parsing:
            return wrapper(CacheablePageContent(page), use_deprecated_html5lib)
        return list(fn(page, use_deprecated_html5lib))  # line 309

    return wrapper_wrapper
```
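This decorator is a standard memoization pattern: `functools.lru_cache` needs hashable arguments, so pip wraps the page in a key object (`CacheablePageContent`). A minimal sketch of the same idea, with illustrative names that are not pip's:

```python
import functools

class Page:
    def __init__(self, url):
        self.url = url

class PageKey:
    """Hashable wrapper so lru_cache can key a cache entry on the page URL."""
    def __init__(self, page):
        self.page = page
    def __eq__(self, other):
        return isinstance(other, PageKey) and self.page.url == other.page.url
    def __hash__(self):
        return hash(self.page.url)

@functools.lru_cache(maxsize=None)
def parse_once(key):
    print("parsing", key.page.url)  # executes once per distinct URL
    return ("parsed", "links")

parse_once(PageKey(Page("https://example.com/simple/pylint/")))
parse_once(PageKey(Page("https://example.com/simple/pylint/")))  # served from cache
```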
4. `site-packages/pip/_internal/index/package_finder.py`, line 799
```python
def process_project_url(
    self, project_url: Link, link_evaluator: LinkEvaluator
) -> List[InstallationCandidate]:
    logger.debug(
        "Fetching project page and analyzing links: %s",
        project_url,
    )
    html_page = self._link_collector.fetch_page(project_url)
    if html_page is None:
        return []

    page_links = list(parse_links(html_page, self._use_deprecated_html5lib))  # line 799

    with indent_log():
        package_links = self.evaluate_links(
            link_evaluator,
            links=page_links,
        )

    return package_links
```

Note that the `HTMLPage` instance is constructed by `html_page = self._link_collector.fetch_page(project_url)`. Following `fetch_page` leads to the logic that builds the page:

```python
def fetch_page(self, location: Link) -> Optional[HTMLPage]:
    """
    Fetch an HTML page containing package links.
    """
    return _get_html_page(location, session=self.session)


def _get_html_page(
    link: Link, session: Optional[PipSession] = None
) -> Optional["HTMLPage"]:
    if session is None:
        raise TypeError(
            "_get_html_page() missing 1 required keyword argument: 'session'"
        )

    url = link.url.split("#", 1)[0]

    # Check for VCS schemes that do not support lookup as web pages.
    vcs_scheme = _match_vcs_scheme(url)
    if vcs_scheme:
        logger.warning(
            "Cannot look at %s URL %s because it does not support lookup as web pages.",
            vcs_scheme,
            link,
        )
        return None

    # Tack index.html onto file:// URLs that point to directories
    scheme, _, path, _, _, _ = urllib.parse.urlparse(url)
    if scheme == "file" and os.path.isdir(urllib.request.url2pathname(path)):
        # add trailing slash if not present so urljoin doesn't trim
        # final segment
        if not url.endswith("/"):
            url += "/"
        url = urllib.parse.urljoin(url, "index.html")
        logger.debug(" file: URL is directory, getting %s", url)

    try:
        resp = _get_html_response(url, session=session)
    except _NotHTTP:
        logger.warning(
            "Skipping page %s because it looks like an archive, and cannot "
            "be checked by a HTTP HEAD request.",
            link,
        )
    except _NotHTML as exc:
        logger.warning(
            "Skipping page %s because the %s request got Content-Type: %s."
            "The only supported Content-Type is text/html",
            link,
            exc.request_desc,
            exc.content_type,
        )
    except NetworkConnectionError as exc:
        _handle_get_page_fail(link, exc)
    except RetryError as exc:
        _handle_get_page_fail(link, exc)
    except SSLError as exc:
        reason = "There was a problem confirming the ssl certificate: "
        reason += str(exc)
        _handle_get_page_fail(link, reason, meth=logger.info)
    except requests.ConnectionError as exc:
        _handle_get_page_fail(link, f"connection error: {exc}")
    except requests.Timeout:
        _handle_get_page_fail(link, "timed out")
    else:
        return _make_html_page(resp, cache_link_parsing=link.cache_link_parsing)
    return None


def _make_html_page(response: Response, cache_link_parsing: bool = True) -> HTMLPage:
    encoding = _get_encoding_from_headers(response.headers)
    return HTMLPage(
        response.content,
        encoding=encoding,
        url=response.url,
        cache_link_parsing=cache_link_parsing,
    )


def _get_encoding_from_headers(headers: ResponseHeaders) -> Optional[str]:
    """Determine if we have any encoding information in our headers."""
    if headers and "Content-Type" in headers:
        content_type, params = cgi.parse_header(headers["Content-Type"])
        if "charset" in params:
            return params["charset"]
    return None
```

This is the encoding-detection logic: pip takes the `charset` parameter straight from the `Content-Type` response header.
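`cgi.parse_header` returns the `charset` value verbatim; it does not check that the name is a real codec. With the mirror's header, the bogus value passes straight through:

```python
import cgi

content_type, params = cgi.parse_header("text/html; charset=utf-8,gbk")
print(content_type)       # text/html
print(params["charset"])  # utf-8,gbk  <- handed to decode() as-is
```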

5. Diagnosis

So the pip mirror is returning `utf-8,gbk` as the encoding, which pip cannot parse. Requesting the mirror directly confirms exactly that header.

Upgrading pip (`python -m pip install --upgrade pip`) hits the same error.

In short: the mirror site advertises `charset=utf-8,gbk`, and even the newest pip available on Python 3.8 does not accept such an encoding name.
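This can be verified with requests. A sketch; the URL below is a placeholder for whichever mirror is actually configured:

```python
import requests

# Placeholder URL: substitute the index configured in your pip.conf.
resp = requests.head("https://mirrors.example.com/pypi/simple/pylint/")
print(resp.headers.get("Content-Type"))
# A broken mirror prints something like: text/html; charset=utf-8,gbk
```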

6. Resolution

Option 1: switch to a different pip index (mirror).

Option 2: patch the pip source code.

Patch the `_make_html_page` function in `site-packages/pip/_internal/index/collector.py` exactly as shown in the Solution section at the top of this post.
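If hard-coding `utf-8` feels too blunt, a slightly more defensive variant (my own sketch, not pip's code) keeps the header-based detection and falls back only when the advertised name is not a registered codec:

```python
import codecs  # add at the top of collector.py

def _make_html_page(response: Response, cache_link_parsing: bool = True) -> HTMLPage:
    encoding = _get_encoding_from_headers(response.headers)
    if encoding is not None:
        try:
            codecs.lookup(encoding)  # is this a registered codec name?
        except LookupError:
            encoding = "utf-8"  # fall back for bogus values like "utf-8,gbk"
    return HTMLPage(
        response.content,
        encoding=encoding,
        url=response.url,
        cache_link_parsing=cache_link_parsing,
    )
```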