Failing command

```
pip install pylint
```

Error message

```
......
  File "/opt/py3.8/ve1/lib/python3.8/site-packages/pip/_internal/index/collector.py", line 351, in parse_links
    parser.feed(page.content.decode(encoding))
LookupError: unknown encoding: utf-8,gbk
```
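The failure is easy to reproduce outside of pip: Python's codec registry has no codec named `utf-8,gbk`, so decoding with that name raises the same `LookupError`:

```python
>>> b"<html></html>".decode("utf-8,gbk")
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
LookupError: unknown encoding: utf-8,gbk
```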

Solution

Option 1: switch to a different pip index (mirror).
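For example, PyPI's official index sends a valid `charset`, so pointing this one install at it sidesteps the broken mirror (any well-behaved mirror works equally well):

```
pip install pylint -i https://pypi.org/simple/
```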

Option 2: patch the pip source code.

Modify the `_make_html_page` function in `site-packages/pip/_internal/index/collector.py` as follows:

```python
def _make_html_page(response: Response, cache_link_parsing: bool = True) -> HTMLPage:
    # encoding = _get_encoding_from_headers(response.headers)  # comment out this encoding-detection line
    encoding = 'utf-8'  # add this line
    return HTMLPage(
        response.content,
        encoding=encoding,
        url=response.url,
        cache_link_parsing=cache_link_parsing,
    )
```

Investigation

1. `site-packages/pip/_internal/index/collector.py`, line 351
```python
@with_cached_html_pages
def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Link]:
    """
    Parse an HTML document, and yield its anchor elements as Link objects.
    """

    if use_deprecated_html5lib:
        yield from _parse_links_html5lib(page)
        return

    parser = HTMLLinkParser(page.url)
    encoding = page.encoding or "utf-8"
    parser.feed(page.content.decode(encoding))  # line 351

    url = page.url
    base_url = parser.base_url or url
    for anchor in parser.anchors:
        link = _create_link_from_element(
            anchor,
            page_url=url,
            base_url=base_url,
        )
        if link is None:
            continue
        yield link
```
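Note that the fallback in `encoding = page.encoding or "utf-8"` only fires when `page.encoding` is `None` or empty. A non-empty but invalid name such as `"utf-8,gbk"` is truthy, so it reaches `decode()` unchanged:

```python
# The fallback never fires for a truthy-but-invalid encoding name:
encoding = "utf-8,gbk" or "utf-8"
print(encoding)  # utf-8,gbk -- decode() then fails to look up this codec
```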
2. `HTMLPage.encoding`
```python
class HTMLPage:
    """Represents one page, along with its URL"""

    def __init__(
        self,
        content: bytes,
        encoding: Optional[str],
        url: str,
        cache_link_parsing: bool = True,
    ) -> None:
        """
        :param encoding: the encoding to decode the given content.
        :param url: the URL from which the HTML was downloaded.
        :param cache_link_parsing: whether links parsed from this page's url
                                   should be cached. PyPI index urls should
                                   have this set to False, for example.
        """
        self.content = content
        self.encoding = encoding
        self.url = url
        self.cache_link_parsing = cache_link_parsing

    def __str__(self) -> str:
        return redact_auth_from_url(self.url)
```
3. `site-packages/pip/_internal/index/collector.py`, line 309
```python
def with_cached_html_pages(fn: ParseLinks) -> ParseLinks:
    """
    Given a function that parses an Iterable[Link] from an HTMLPage, cache the
    function's result (keyed by CacheablePageContent), unless the HTMLPage
    `page` has `page.cache_link_parsing == False`.
    """

    @functools.lru_cache(maxsize=None)
    def wrapper(
        cacheable_page: CacheablePageContent, use_deprecated_html5lib: bool
    ) -> List[Link]:
        return list(fn(cacheable_page.page, use_deprecated_html5lib))

    @functools.wraps(fn)
    def wrapper_wrapper(page: "HTMLPage", use_deprecated_html5lib: bool) -> List[Link]:
        if page.cache_link_parsing:
            return wrapper(CacheablePageContent(page), use_deprecated_html5lib)
        return list(fn(page, use_deprecated_html5lib))  # line 309

    return wrapper_wrapper
```
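This decorator is a standard memoization pattern: `functools.lru_cache` needs hashable arguments, so pip wraps the page in a key object (`CacheablePageContent`). A minimal sketch of the same idea, with illustrative names that are not pip's:

```python
import functools

class Page:
    def __init__(self, url):
        self.url = url

class PageKey:
    """Hashable wrapper so lru_cache can key a cache entry on the page URL."""
    def __init__(self, page):
        self.page = page
    def __eq__(self, other):
        return isinstance(other, PageKey) and self.page.url == other.page.url
    def __hash__(self):
        return hash(self.page.url)

@functools.lru_cache(maxsize=None)
def parse_once(key):
    print("parsing", key.page.url)  # executes once per distinct URL
    return ("parsed", "links")

parse_once(PageKey(Page("https://example.com/simple/pylint/")))
parse_once(PageKey(Page("https://example.com/simple/pylint/")))  # served from cache
```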
4. `site-packages/pip/_internal/index/package_finder.py`, line 799
```python
def process_project_url(
    self, project_url: Link, link_evaluator: LinkEvaluator
) -> List[InstallationCandidate]:
    logger.debug(
        "Fetching project page and analyzing links: %s",
        project_url,
    )
    html_page = self._link_collector.fetch_page(project_url)
    if html_page is None:
        return []

    page_links = list(parse_links(html_page, self._use_deprecated_html5lib))  # line 799

    with indent_log():
        package_links = self.evaluate_links(
            link_evaluator,
            links=page_links,
        )

    return package_links
```

Note that the `HTMLPage` instance is constructed by `html_page = self._link_collector.fetch_page(project_url)`. Following `fetch_page` leads to the logic that builds the page:

```python
def fetch_page(self, location: Link) -> Optional[HTMLPage]:
    """
    Fetch an HTML page containing package links.
    """
    return _get_html_page(location, session=self.session)


def _get_html_page(
    link: Link, session: Optional[PipSession] = None
) -> Optional["HTMLPage"]:
    if session is None:
        raise TypeError(
            "_get_html_page() missing 1 required keyword argument: 'session'"
        )

    url = link.url.split("#", 1)[0]

    # Check for VCS schemes that do not support lookup as web pages.
    vcs_scheme = _match_vcs_scheme(url)
    if vcs_scheme:
        logger.warning(
            "Cannot look at %s URL %s because it does not support lookup as web pages.",
            vcs_scheme,
            link,
        )
        return None

    # Tack index.html onto file:// URLs that point to directories
    scheme, _, path, _, _, _ = urllib.parse.urlparse(url)
    if scheme == "file" and os.path.isdir(urllib.request.url2pathname(path)):
        # add trailing slash if not present so urljoin doesn't trim
        # final segment
        if not url.endswith("/"):
            url += "/"
        url = urllib.parse.urljoin(url, "index.html")
        logger.debug(" file: URL is directory, getting %s", url)

    try:
        resp = _get_html_response(url, session=session)
    except _NotHTTP:
        logger.warning(
            "Skipping page %s because it looks like an archive, and cannot "
            "be checked by a HTTP HEAD request.",
            link,
        )
    except _NotHTML as exc:
        logger.warning(
            "Skipping page %s because the %s request got Content-Type: %s."
            "The only supported Content-Type is text/html",
            link,
            exc.request_desc,
            exc.content_type,
        )
    except NetworkConnectionError as exc:
        _handle_get_page_fail(link, exc)
    except RetryError as exc:
        _handle_get_page_fail(link, exc)
    except SSLError as exc:
        reason = "There was a problem confirming the ssl certificate: "
        reason += str(exc)
        _handle_get_page_fail(link, reason, meth=logger.info)
    except requests.ConnectionError as exc:
        _handle_get_page_fail(link, f"connection error: {exc}")
    except requests.Timeout:
        _handle_get_page_fail(link, "timed out")
    else:
        return _make_html_page(resp, cache_link_parsing=link.cache_link_parsing)
    return None


def _make_html_page(response: Response, cache_link_parsing: bool = True) -> HTMLPage:
    encoding = _get_encoding_from_headers(response.headers)
    return HTMLPage(
        response.content,
        encoding=encoding,
        url=response.url,
        cache_link_parsing=cache_link_parsing,
    )


def _get_encoding_from_headers(headers: ResponseHeaders) -> Optional[str]:
    """Determine if we have any encoding information in our headers."""
    if headers and "Content-Type" in headers:
        content_type, params = cgi.parse_header(headers["Content-Type"])
        if "charset" in params:
            return params["charset"]
    return None
```

This is the encoding-detection logic: pip takes the `charset` parameter straight from the `Content-Type` response header.
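`cgi.parse_header` returns the `charset` value verbatim; it does not check that the name is a real codec. With the mirror's header, the bogus value passes straight through:

```python
import cgi

content_type, params = cgi.parse_header("text/html; charset=utf-8,gbk")
print(content_type)       # text/html
print(params["charset"])  # utf-8,gbk  <- handed to decode() as-is
```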

5. Diagnosis

So the pip mirror is returning `utf-8,gbk` as the encoding, which pip cannot parse. Requesting the mirror directly confirms exactly that header.

Upgrading pip (`python -m pip install --upgrade pip`) hits the same error.

In short: the mirror site advertises `charset=utf-8,gbk`, and even the newest pip available on Python 3.8 does not accept such an encoding name.
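This can be verified with requests. A sketch; the URL below is a placeholder for whichever mirror is actually configured:

```python
import requests

# Placeholder URL: substitute the index configured in your pip.conf.
resp = requests.head("https://mirrors.example.com/pypi/simple/pylint/")
print(resp.headers.get("Content-Type"))
# A broken mirror prints something like: text/html; charset=utf-8,gbk
```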

6. Resolution

Option 1: switch to a different pip index (mirror).

Option 2: patch the pip source code.

Patch the `_make_html_page` function in `site-packages/pip/_internal/index/collector.py` exactly as shown in the Solution section at the top of this post.
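If hard-coding `utf-8` feels too blunt, a slightly more defensive variant (my own sketch, not pip's code) keeps the header-based detection and falls back only when the advertised name is not a registered codec:

```python
import codecs  # add at the top of collector.py

def _make_html_page(response: Response, cache_link_parsing: bool = True) -> HTMLPage:
    encoding = _get_encoding_from_headers(response.headers)
    if encoding is not None:
        try:
            codecs.lookup(encoding)  # is this a registered codec name?
        except LookupError:
            encoding = "utf-8"  # fall back for bogus values like "utf-8,gbk"
    return HTMLPage(
        response.content,
        encoding=encoding,
        url=response.url,
        cache_link_parsing=cache_link_parsing,
    )
```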