diff options
Diffstat (limited to 'lib/bb/fetch2/wget.py')
-rw-r--r-- | lib/bb/fetch2/wget.py | 274 |
1 files changed, 153 insertions, 121 deletions
diff --git a/lib/bb/fetch2/wget.py b/lib/bb/fetch2/wget.py index 8f505b6de..fbfa6938a 100644 --- a/lib/bb/fetch2/wget.py +++ b/lib/bb/fetch2/wget.py @@ -1,5 +1,3 @@ -# ex:ts=4:sw=4:sts=4:et -# -*- tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- """ BitBake 'Fetch' implementations @@ -10,35 +8,24 @@ BitBake build tools. # Copyright (C) 2003, 2004 Chris Larson # -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# SPDX-License-Identifier: GPL-2.0-only # # Based on functions from the base bb module, Copyright 2003 Holger Schurig +import shlex import re import tempfile -import subprocess import os -import logging import errno import bb import bb.progress +import socket +import http.client import urllib.request, urllib.parse, urllib.error from bb.fetch2 import FetchMethod from bb.fetch2 import FetchError from bb.fetch2 import logger from bb.fetch2 import runfetchcmd -from bb.utils import export_proxies from bs4 import BeautifulSoup from bs4 import SoupStrainer @@ -65,11 +52,23 @@ class WgetProgressHandler(bb.progress.LineFilterProgressHandler): class Wget(FetchMethod): """Class to fetch urls via 'wget'""" + + # CDNs like CloudFlare may do a 'browser integrity test' which can fail + # with the standard wget/urllib User-Agent, so pretend to be a modern + # browser. 
+ user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0" + + def check_certs(self, d): + """ + Should certificates be checked? + """ + return (d.getVar("BB_CHECK_SSL_CERTS") or "1") != "0" + def supports(self, ud, d): """ Check to see if a given url can be fetched with wget. """ - return ud.type in ['http', 'https', 'ftp'] + return ud.type in ['http', 'https', 'ftp', 'ftps'] def recommends_checksum(self, urldata): return True @@ -88,13 +87,19 @@ class Wget(FetchMethod): if not ud.localfile: ud.localfile = d.expand(urllib.parse.unquote(ud.host + ud.path).replace("/", ".")) - self.basecmd = d.getVar("FETCHCMD_wget") or "/usr/bin/env wget -t 2 -T 30 --passive-ftp --no-check-certificate" + self.basecmd = d.getVar("FETCHCMD_wget") or "/usr/bin/env wget -t 2 -T 30" + + if ud.type == 'ftp' or ud.type == 'ftps': + self.basecmd += " --passive-ftp" + + if not self.check_certs(d): + self.basecmd += " --no-check-certificate" def _runwget(self, ud, d, command, quiet, workdir=None): progresshandler = WgetProgressHandler(d) - logger.debug(2, "Fetching %s using command '%s'" % (ud.url, command)) + logger.debug2("Fetching %s using command '%s'" % (ud.url, command)) bb.fetch2.check_network_access(d, command, ud.url) runfetchcmd(command + ' --progress=dot -v', d, quiet, log=progresshandler, workdir=workdir) @@ -103,13 +108,22 @@ class Wget(FetchMethod): fetchcmd = self.basecmd - if 'downloadfilename' in ud.parm: - dldir = d.getVar("DL_DIR") - bb.utils.mkdirhier(os.path.dirname(dldir + os.sep + ud.localfile)) - fetchcmd += " -O " + dldir + os.sep + ud.localfile + localpath = os.path.join(d.getVar("DL_DIR"), ud.localfile) + ".tmp" + bb.utils.mkdirhier(os.path.dirname(localpath)) + fetchcmd += " -O %s" % shlex.quote(localpath) if ud.user and ud.pswd: - fetchcmd += " --user=%s --password=%s --auth-no-challenge" % (ud.user, ud.pswd) + fetchcmd += " --auth-no-challenge" + if ud.parm.get("redirectauth", "1") == "1": + # An undocumented feature of wget is 
that if the + # username/password are specified on the URI, wget will only + # send the Authorization header to the first host and not to + # any hosts that it is redirected to. With the increasing + # usage of temporary AWS URLs, this difference now matters as + # AWS will reject any request that has authentication both in + # the query parameters (from the redirect) and in the + # Authorization header. + fetchcmd += " --user=%s --password=%s" % (ud.user, ud.pswd) uri = ud.url.split(";")[0] if os.path.exists(ud.localpath): @@ -120,6 +134,15 @@ class Wget(FetchMethod): self._runwget(ud, d, fetchcmd, False) + # Try and verify any checksum now, meaning if it isn't correct, we don't remove the + # original file, which might be a race (imagine two recipes referencing the same + # source, one with an incorrect checksum) + bb.fetch2.verify_checksum(ud, d, localpath=localpath, fatal_nochecksum=False) + + # Remove the ".tmp" and move the file into position atomically + # Our lock prevents multiple writers but mirroring code may grab incomplete files + os.rename(localpath, localpath[:-4]) + # Sanity check since wget can pretend it succeed when it didn't # Also, this used to happen if sourceforge sent us to the mirror page if not os.path.exists(ud.localpath): @@ -132,10 +155,6 @@ class Wget(FetchMethod): return True def checkstatus(self, fetch, ud, d, try_again=True): - import urllib.request, urllib.error, urllib.parse, socket, http.client - from urllib.response import addinfourl - from bb.fetch2 import FetchConnectionCache - class HTTPConnectionCache(http.client.HTTPConnection): if fetch.connection_cache: def connect(self): @@ -168,7 +187,7 @@ class Wget(FetchMethod): """ host = req.host if not host: - raise urlllib2.URLError('no host given') + raise urllib.error.URLError('no host given') h = http_class(host, timeout=req.timeout) # will parse host:port h.set_debuglevel(self._debuglevel) @@ -185,7 +204,7 @@ class Wget(FetchMethod): # request. 
# Don't close connection when connection_cache is enabled, - if fetch.connection_cache is None: + if fetch.connection_cache is None: headers["Connection"] = "close" else: headers["Connection"] = "Keep-Alive" # Works for HTTP/1.0 @@ -219,15 +238,12 @@ class Wget(FetchMethod): # We let the request fail and expect it to be # tried once more ("try_again" in check_status()), # with the dead connection removed from the cache. - # If it still fails, we give up, which can happend for bad + # If it still fails, we give up, which can happen for bad # HTTP proxy settings. fetch.connection_cache.remove_connection(h.host, h.port) raise urllib.error.URLError(err) else: - try: - r = h.getresponse(buffering=True) - except TypeError: # buffering kw not supported - r = h.getresponse() + r = h.getresponse() # Pick apart the HTTPResponse object to get the addinfourl # object initialized properly. @@ -252,7 +268,7 @@ class Wget(FetchMethod): pass closed = False - resp = addinfourl(fp_dummy(), r.msg, req.get_full_url()) + resp = urllib.response.addinfourl(fp_dummy(), r.msg, req.get_full_url()) resp.code = r.status resp.msg = r.reason @@ -271,17 +287,18 @@ class Wget(FetchMethod): fp.read() fp.close() - newheaders = dict((k,v) for k,v in list(req.headers.items()) - if k.lower() not in ("content-length", "content-type")) - return self.parent.open(urllib.request.Request(req.get_full_url(), - headers=newheaders, - origin_req_host=req.origin_req_host, - unverifiable=True)) + if req.get_method() != 'GET': + newheaders = dict((k, v) for k, v in list(req.headers.items()) + if k.lower() not in ("content-length", "content-type")) + return self.parent.open(urllib.request.Request(req.get_full_url(), + headers=newheaders, + origin_req_host=req.origin_req_host, + unverifiable=True)) - """ - Some servers (e.g. GitHub archives, hosted on Amazon S3) return 403 - Forbidden when they actually mean 405 Method Not Allowed. 
- """ + raise urllib.request.HTTPError(req, code, msg, headers, None) + + # Some servers (e.g. GitHub archives, hosted on Amazon S3) return 403 + # Forbidden when they actually mean 405 Method Not Allowed. http_error_403 = http_error_405 @@ -292,57 +309,78 @@ class Wget(FetchMethod): """ def redirect_request(self, req, fp, code, msg, headers, newurl): newreq = urllib.request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, newurl) - newreq.get_method = lambda: req.get_method() + newreq.get_method = req.get_method return newreq - exported_proxies = export_proxies(d) - - handlers = [FixedHTTPRedirectHandler, HTTPMethodFallback] - if export_proxies: - handlers.append(urllib.request.ProxyHandler()) - handlers.append(CacheHTTPHandler()) - # XXX: Since Python 2.7.9 ssl cert validation is enabled by default - # see PEP-0476, this causes verification errors on some https servers - # so disable by default. - import ssl - if hasattr(ssl, '_create_unverified_context'): - handlers.append(urllib.request.HTTPSHandler(context=ssl._create_unverified_context())) - opener = urllib.request.build_opener(*handlers) - - try: - uri = ud.url.split(";")[0] - r = urllib.request.Request(uri) - r.get_method = lambda: "HEAD" - # Some servers (FusionForge, as used on Alioth) require that the - # optional Accept header is set. 
- r.add_header("Accept", "*/*") - def add_basic_auth(login_str, request): - '''Adds Basic auth to http request, pass in login:password as string''' - import base64 - encodeuser = base64.b64encode(login_str.encode('utf-8')).decode("utf-8") - authheader = "Basic %s" % encodeuser - r.add_header("Authorization", authheader) - - if ud.user: - add_basic_auth(ud.user, r) - try: - import netrc, urllib.parse - n = netrc.netrc() - login, unused, password = n.authenticators(urllib.parse.urlparse(uri).hostname) - add_basic_auth("%s:%s" % (login, password), r) - except (TypeError, ImportError, IOError, netrc.NetrcParseError): - pass - - with opener.open(r) as response: - pass - except urllib.error.URLError as e: - if try_again: - logger.debug(2, "checkstatus: trying again") - return self.checkstatus(fetch, ud, d, False) + # We need to update the environment here as both the proxy and HTTPS + # handlers need variables set. The proxy needs http_proxy and friends to + # be set, and HTTPSHandler ends up calling into openssl to load the + # certificates. In buildtools configurations this will be looking at the + # wrong place for certificates by default: we set SSL_CERT_FILE to the + # right location in the buildtools environment script but as BitBake + # prunes the environment this is lost. When binaries are executed + # runfetchcmd ensures these values are in the environment, but this is + # pure Python so we need to update the environment. + # + # Avoid trampling the environment too much by using bb.utils.environment + # to scope the changes to the build_opener request, which is when the + # environment lookups happen. + newenv = bb.fetch2.get_fetcher_environment(d) + + with bb.utils.environment(**newenv): + import ssl + + if self.check_certs(d): + context = ssl.create_default_context() else: - # debug for now to avoid spamming the logs in e.g. 
remote sstate searches - logger.debug(2, "checkstatus() urlopen failed: %s" % e) - return False + context = ssl._create_unverified_context() + + handlers = [FixedHTTPRedirectHandler, + HTTPMethodFallback, + urllib.request.ProxyHandler(), + CacheHTTPHandler(), + urllib.request.HTTPSHandler(context=context)] + opener = urllib.request.build_opener(*handlers) + + try: + uri_base = ud.url.split(";")[0] + uri = "{}://{}{}".format(urllib.parse.urlparse(uri_base).scheme, ud.host, ud.path) + r = urllib.request.Request(uri) + r.get_method = lambda: "HEAD" + # Some servers (FusionForge, as used on Alioth) require that the + # optional Accept header is set. + r.add_header("Accept", "*/*") + r.add_header("User-Agent", self.user_agent) + def add_basic_auth(login_str, request): + '''Adds Basic auth to http request, pass in login:password as string''' + import base64 + encodeuser = base64.b64encode(login_str.encode('utf-8')).decode("utf-8") + authheader = "Basic %s" % encodeuser + r.add_header("Authorization", authheader) + + if ud.user and ud.pswd: + add_basic_auth(ud.user + ':' + ud.pswd, r) + + try: + import netrc + auth_data = netrc.netrc().authenticators(urllib.parse.urlparse(uri).hostname) + if auth_data: + login, _, password = auth_data + add_basic_auth("%s:%s" % (login, password), r) + except (FileNotFoundError, netrc.NetrcParseError): + pass + + with opener.open(r, timeout=30) as response: + pass + except (urllib.error.URLError, ConnectionResetError, TimeoutError) as e: + if try_again: + logger.debug2("checkstatus: trying again") + return self.checkstatus(fetch, ud, d, False) + else: + # debug for now to avoid spamming the logs in e.g. 
remote sstate searches + logger.debug2("checkstatus() urlopen failed for %s: %s" % (uri,e)) + return False + return True def _parse_path(self, regex, s): @@ -396,18 +434,14 @@ class Wget(FetchMethod): (oldpn, oldpv, oldsuffix) = old (newpn, newpv, newsuffix) = new - """ - Check for a new suffix type that we have never heard of before - """ - if (newsuffix): + # Check for a new suffix type that we have never heard of before + if newsuffix: m = self.suffix_regex_comp.search(newsuffix) if not m: bb.warn("%s has a possible unknown suffix: %s" % (newpn, newsuffix)) return False - """ - Not our package so ignore it - """ + # Not our package so ignore it if oldpn != newpn: return False @@ -422,9 +456,8 @@ class Wget(FetchMethod): """ f = tempfile.NamedTemporaryFile() with tempfile.TemporaryDirectory(prefix="wget-index-") as workdir, tempfile.NamedTemporaryFile(dir=workdir, prefix="wget-listing-") as f: - agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.12) Gecko/20101027 Ubuntu/9.10 (karmic) Firefox/3.6.12" fetchcmd = self.basecmd - fetchcmd += " -O " + f.name + " --user-agent='" + agent + "' '" + uri + "'" + fetchcmd += " -O " + f.name + " --user-agent='" + self.user_agent + "' '" + uri + "'" try: self._runwget(ud, d, fetchcmd, True, workdir=workdir) fetchresult = f.read() @@ -473,15 +506,14 @@ class Wget(FetchMethod): return "" - def _check_latest_version_by_dir(self, dirver, package, package_regex, - current_version, ud, d): + def _check_latest_version_by_dir(self, dirver, package, package_regex, current_version, ud, d): """ - Scan every directory in order to get upstream version. + Scan every directory in order to get upstream version. 
""" version_dir = ['', '', ''] version = ['', '', ''] - dirver_regex = re.compile("(?P<pfx>\D*)(?P<ver>(\d+[\.\-_])+(\d+))") + dirver_regex = re.compile(r"(?P<pfx>\D*)(?P<ver>(\d+[\.\-_])*(\d+))") s = dirver_regex.search(dirver) if s: version_dir[1] = s.group('ver') @@ -541,26 +573,26 @@ class Wget(FetchMethod): gst-fluendo-mp3 """ # match most patterns which uses "-" as separator to version digits - pn_prefix1 = "[a-zA-Z][a-zA-Z0-9]*([-_][a-zA-Z]\w+)*\+?[-_]" + pn_prefix1 = r"[a-zA-Z][a-zA-Z0-9]*([-_][a-zA-Z]\w+)*\+?[-_]" # a loose pattern such as for unzip552.tar.gz - pn_prefix2 = "[a-zA-Z]+" + pn_prefix2 = r"[a-zA-Z]+" # a loose pattern such as for 80325-quicky-0.4.tar.gz - pn_prefix3 = "[0-9]+[-]?[a-zA-Z]+" + pn_prefix3 = r"[0-9]+[-]?[a-zA-Z]+" # Save the Package Name (pn) Regex for use later - pn_regex = "(%s|%s|%s)" % (pn_prefix1, pn_prefix2, pn_prefix3) + pn_regex = r"(%s|%s|%s)" % (pn_prefix1, pn_prefix2, pn_prefix3) # match version - pver_regex = "(([A-Z]*\d+[a-zA-Z]*[\.\-_]*)+)" + pver_regex = r"(([A-Z]*\d+[a-zA-Z]*[\.\-_]*)+)" # match arch parch_regex = "-source|_all_" # src.rpm extension was added only for rpm package. 
Can be removed if the rpm # packaged will always be considered as having to be manually upgraded - psuffix_regex = "(tar\.gz|tgz|tar\.bz2|zip|xz|tar\.lz|rpm|bz2|orig\.tar\.gz|tar\.xz|src\.tar\.gz|src\.tgz|svnr\d+\.tar\.bz2|stable\.tar\.gz|src\.rpm)" + psuffix_regex = r"(tar\.\w+|tgz|zip|xz|rpm|bz2|orig\.tar\.\w+|src\.tar\.\w+|src\.tgz|svnr\d+\.tar\.\w+|stable\.tar\.\w+|src\.rpm)" # match name, version and archive type of a package - package_regex_comp = re.compile("(?P<name>%s?\.?v?)(?P<pver>%s)(?P<arch>%s)?[\.-](?P<type>%s$)" + package_regex_comp = re.compile(r"(?P<name>%s?\.?v?)(?P<pver>%s)(?P<arch>%s)?[\.-](?P<type>%s$)" % (pn_regex, pver_regex, parch_regex, psuffix_regex)) self.suffix_regex_comp = re.compile(psuffix_regex) @@ -572,7 +604,7 @@ class Wget(FetchMethod): version = self._parse_path(package_regex_comp, package) if version: package_custom_regex_comp = re.compile( - "(?P<name>%s)(?P<pver>%s)(?P<arch>%s)?[\.-](?P<type>%s)" % + r"(?P<name>%s)(?P<pver>%s)(?P<arch>%s)?[\.-](?P<type>%s)" % (re.escape(version[0]), pver_regex, parch_regex, psuffix_regex)) else: package_custom_regex_comp = None @@ -589,7 +621,7 @@ class Wget(FetchMethod): current_version = ['', d.getVar('PV'), ''] """possible to have no version in pkg name, such as spectrum-fw""" - if not re.search("\d+", package): + if not re.search(r"\d+", package): current_version[1] = re.sub('_', '.', current_version[1]) current_version[1] = re.sub('-', '.', current_version[1]) return (current_version[1], '') @@ -607,13 +639,13 @@ class Wget(FetchMethod): # search for version matches on folders inside the path, like: # "5.7" in http://download.gnome.org/sources/${PN}/5.7/${PN}-${PV}.tar.gz - dirver_regex = re.compile("(?P<dirver>[^/]*(\d+\.)*\d+([-_]r\d+)*)/") - m = dirver_regex.search(path) + dirver_regex = re.compile(r"(?P<dirver>[^/]*(\d+\.)*\d+([-_]r\d+)*)/") + m = dirver_regex.findall(path) if m: pn = d.getVar('PN') - dirver = m.group('dirver') + dirver = m[-1][0] - dirver_pn_regex = 
re.compile("%s\d?" % (re.escape(pn))) + dirver_pn_regex = re.compile(r"%s\d?" % (re.escape(pn))) if not dirver_pn_regex.search(dirver): return (self._check_latest_version_by_dir(dirver, package, package_regex, current_version, ud, d), '') |