If you look at scrapyd's internal handling and the Scrapy documentation, you will notice that a few environment variables matter quite a bit: scrapyd prepares them as part of the process environment when it actually launches a crawl.
Below is a brief walkthrough.
Environment variables
- SCRAPY_SETTINGS_MODULE
Settings related; the Scrapy framework is configuration driven, and this variable tells it which settings module to import:
```python
def init_env(project: str = "default", set_syspath: bool = True) -> None:
    """Initialize environment to use command-line tool from inside a project
    dir. This sets the Scrapy settings module and modifies the Python path to
    be able to locate the project module.
    """
    cfg = get_config()
    if cfg.has_option("settings", project):
        os.environ["SCRAPY_SETTINGS_MODULE"] = cfg.get("settings", project)
    closest = closest_scrapy_cfg()
    if closest:
        projdir = str(Path(closest).parent)
        if set_syspath and projdir not in sys.path:
            sys.path.append(projdir)
```
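For reference, here is a minimal sketch (the project name `myproject` is an assumption, not something taken from the code above) of what `init_env` reads from `scrapy.cfg` and how later code can pick the value back up from the environment:

```python
import os
from importlib import import_module

# scrapy.cfg at the project root maps a target name to a settings module, e.g.
#
#   [settings]
#   default = myproject.settings      # "myproject" is a hypothetical project
#
# init_env("default") copies that value into SCRAPY_SETTINGS_MODULE, so any
# code running afterwards (including child processes) can locate the project
# settings purely through the environment:
settings_module = os.environ.get("SCRAPY_SETTINGS_MODULE")
if settings_module:
    settings = import_module(settings_module)
    print(settings_module, "->", getattr(settings, "BOT_NAME", None))
else:
    print("SCRAPY_SETTINGS_MODULE is not set; run this from inside a Scrapy project")
```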
When scrapyd runs an egg file, it parses the entry_points declared in the egg's setup metadata to find the settings module:
```python
def activate_egg(eggpath):
    """Activate a Scrapy egg file. This is meant to be used from egg runners
    to activate a Scrapy egg file. Don't use it from other code as it may
    leave unwanted side effects.
    """
    try:
        d = next(pkg_resources.find_distributions(eggpath))
    except StopIteration:
        raise ValueError("Unknown or corrupt egg")
    d.activate()
    settings_module = d.get_entry_info('scrapy', 'settings').module_name
    os.environ.setdefault('SCRAPY_SETTINGS_MODULE', settings_module)
```
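For the `'scrapy'` / `'settings'` entry point above to exist, the egg has to declare it in its packaging metadata. A sketch of such a `setup.py` (the project name `myproject` is assumed; `scrapyd-deploy` normally generates an equivalent file for you):

```python
# setup.py for an egg that scrapyd can run ("myproject" is a placeholder).
# activate_egg() reads the 'scrapy'/'settings' entry point from exactly this
# metadata and exports it as SCRAPY_SETTINGS_MODULE.
from setuptools import find_packages, setup

setup(
    name="myproject",
    version="1.0",
    packages=find_packages(),
    entry_points={"scrapy": ["settings = myproject.settings"]},
)
```

Building this with `python setup.py bdist_egg` produces the egg that gets uploaded to scrapyd.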
- SCRAPY_PROJECT
Project related. scrapyd depends on it in the Launcher, which builds the environment for each crawl process:
```python
def get_environment(self, message, slot):
    project = message['_project']
    env = self.initenv.copy()
    env['SCRAPY_PROJECT'] = project
    env['SCRAPYD_SLOT'] = str(slot)
    env['SCRAPYD_SPIDER'] = message['_spider']
    env['SCRAPYD_JOB'] = message['_job']
    if '_version' in message:
        env['SCRAPYD_EGG_VERSION'] = message['_version']
    if project in self.settings:
        env['SCRAPY_SETTINGS_MODULE'] = self.settings[project]
    if self.logs_dir:
        env['SCRAPYD_LOG_FILE'] = self._get_file(message, self.logs_dir, 'log')
    if self.items_dir:
        env['SCRAPYD_FEED_URI'] = self._get_feed_uri(message, 'jl')
    return env
```
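Inside a spider (or an extension) these variables can be read straight from `os.environ`. A small sketch, with a hypothetical spider, of consuming the values injected by `get_environment`:

```python
import os

import scrapy


class JobAwareSpider(scrapy.Spider):
    """Hypothetical spider that logs the scrapyd-provided context."""

    name = "job_aware"
    start_urls = ["https://example.com"]

    def parse(self, response):
        # These keys are set by get_environment() above; when the spider runs
        # outside scrapyd they are simply missing, hence the .get() calls.
        self.logger.info(
            "project=%s job=%s slot=%s log=%s",
            os.environ.get("SCRAPY_PROJECT"),
            os.environ.get("SCRAPYD_JOB"),
            os.environ.get("SCRAPYD_SLOT"),
            os.environ.get("SCRAPYD_LOG_FILE"),
        )
        yield {"url": response.url}
```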
- SCRAPY_PYTHON_SHELL
Controls which interactive Python interpreter `scrapy shell` uses; the environment variable is consulted first, and if it is not set the defaults are used:
```python
def start(
    self,
    url: Optional[str] = None,
    request: Optional[Request] = None,
    response: Optional[Response] = None,
    spider: Optional[Spider] = None,
    redirect: bool = True,
) -> None:
    # disable accidental Ctrl-C key press from shutting down the engine
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    if url:
        self.fetch(url, spider, redirect=redirect)
    elif request:
        self.fetch(request, spider)
    elif response:
        request = response.request
        self.populate_vars(response, request, spider)
    else:
        self.populate_vars()
    if self.code:
        print(eval(self.code, globals(), self.vars))  # nosec
    else:
        """
        Detect interactive shell setting in scrapy.cfg
        e.g.: ~/.config/scrapy.cfg or ~/.scrapy.cfg
        [settings]
        # shell can be one of ipython, bpython or python;
        # to be used as the interactive python console, if available.
        # (default is ipython, fallbacks in the order listed above)
        shell = python
        """
        cfg = get_config()
        section, option = "settings", "shell"
        env = os.environ.get("SCRAPY_PYTHON_SHELL")
        shells = []
        if env:
            shells += env.strip().lower().split(",")
        elif cfg.has_option(section, option):
            shells += [cfg.get(section, option).strip().lower()]
        else:  # try all by default
            shells += DEFAULT_PYTHON_SHELLS.keys()
        # always add standard shell as fallback
        shells += ["python"]
        start_python_console(
            self.vars, shells=shells, banner=self.vars.pop("banner", "")
        )
```
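Stripped of the Shell internals, the selection boils down to the order below. This is a standalone sketch of that order (the shell names come from the docstring above), not Scrapy's actual code:

```python
import os

# Fallback order taken from the docstring above: ipython, bpython, python.
KNOWN_SHELLS = ["ipython", "bpython", "python"]


def resolve_shells(cfg_value=None):
    """Return the candidate shells in the order scrapy shell would try them."""
    env = os.environ.get("SCRAPY_PYTHON_SHELL")
    if env:                      # 1. environment variable, comma-separated
        shells = env.strip().lower().split(",")
    elif cfg_value:              # 2. [settings] shell = ... from scrapy.cfg
        shells = [cfg_value.strip().lower()]
    else:                        # 3. try all known shells
        shells = list(KNOWN_SHELLS)
    return shells + ["python"]   # plain python is always the last resort


# e.g. launched as: SCRAPY_PYTHON_SHELL=bpython scrapy shell https://example.com
print(resolve_shells())
```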
"""Initialize environment to use command-line tool from inside a project
Notes
These Scrapy variables are fairly important in practice: scrapyd takes care of setting them when it runs a job, and the packaged egg file also carries an explicit settings configuration through its entry point.