commit 7184aed2ed1b841ee2416c765d3e6d59ddb9df41
Author: Yohane
Date:   Thu Dec 21 18:26:12 2023 +0800

    First commit

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..af02bac
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+*.py text=auto eol=lf
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
new file mode 100644
index 0000000..8fb0f7f
--- /dev/null
+++ b/.github/workflows/main.yml
@@ -0,0 +1,100 @@
+name: PyInstaller
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+
+    strategy:
+      matrix:
+        os: [windows-latest, macos-latest, ubuntu-latest]
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Install UPX
+        uses: crazy-max/ghaction-upx@v2
+        if: matrix.os == 'windows-latest' || matrix.os == 'ubuntu-latest'
+        with:
+          install-only: true
+
+      - name: UPX version
+        if: matrix.os == 'windows-latest' || matrix.os == 'ubuntu-latest'
+        run: upx --version
+
+      - name: Setup Python 3.10
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.10'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install face_recognition --no-deps
+          pip install pyinstaller
+
+      - name: Test number_parser.get_number
+        run: |
+          python number_parser.py -v
+
+      - name: Build with PyInstaller for macos/ubuntu
+        if: matrix.os == 'macos-latest' || matrix.os == 'ubuntu-latest'
+        run: |
+          pyinstaller \
+            --onefile Movie_Data_Capture.py \
+            --python-option u \
+            --hidden-import "ImageProcessing.cnn" \
+            --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
+            --add-data "$(python -c 'import opencc as _; print(_.__path__[0])' | tail -n 1):opencc" \
+            --add-data "$(python -c 'import face_recognition_models as _; print(_.__path__[0])' | tail -n 1):face_recognition_models" \
+            --add-data "Img:Img" \
+            --add-data "scrapinglib:scrapinglib" \
+            --add-data "config.ini:."
+
+      - name: Build with PyInstaller for windows
+        if: matrix.os == 'windows-latest'
+        run: |
+          pyinstaller `
+            --onefile Movie_Data_Capture.py `
+            --python-option u `
+            --hidden-import "ImageProcessing.cnn" `
+            --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1);cloudscraper" `
+            --add-data "$(python -c 'import opencc as _; print(_.__path__[0])' | tail -n 1);opencc" `
+            --add-data "$(python -c 'import face_recognition_models as _; print(_.__path__[0])' | tail -n 1);face_recognition_models" `
+            --add-data "Img;Img" `
+            --add-data "scrapinglib;scrapinglib" `
+            --add-data "config.ini;."
` + + - name: Copy config.ini + run: | + cp config.ini dist/ + + - name: Set VERSION variable for macos/ubuntu + if: matrix.os == 'macos-latest' || matrix.os == 'ubuntu-latest' + run: | + echo "VERSION=$(python Movie_Data_Capture.py --version)" >> $GITHUB_ENV + + - name: Set VERSION variable for windows + if: matrix.os == 'windows-latest' + run: | + echo "VERSION=$(python Movie_Data_Capture.py --version)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + + - name: Upload build artifact + uses: actions/upload-artifact@v1 + with: + name: MDC-${{ env.VERSION }}-${{ runner.os }}-amd64 + path: dist + + - name: Run test (Ubuntu & MacOS) + if: matrix.os == 'ubuntu-latest' || matrix.os == 'macos-latest' + run: | + cd dist + ./Movie_Data_Capture diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..09632a9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,114 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# movie files +*.mp4 + +# success/failed folder +JAV_output/**/* +failed/* +.vscode/launch.json + +.idea \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..abd8b96 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,29 @@ +{ + // 使用 IntelliSense 了解相关属性。 + // 悬停以查看现有属性的描述。 + // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: 当前文件", + "type": "python", + "request": "launch", + "console": "integratedTerminal", + "env": { + "PYTHONIOENCODING": "utf-8" + }, + "program": "${workspaceFolder}/Movie_Data_capture.py", + "program4": "${workspaceFolder}/number_parser.py", + "program5": "${workspaceFolder}/config.py", + "cwd0": "${fileDirname}", + "cwd1": "${workspaceFolder}/dist", + "cwd2": "${env:HOME}${env:USERPROFILE}/.mdc", + "args0": ["-a","-p","J:/Downloads","-o","J:/log"], + "args1": ["-g","-m","3","-c","1","-d","0"], + "args3": ["-agd0","-m3", "-q", ".*","-p","J:/#output"], + "args4": ["-gic1", "-d0", "-m3", "-o", "avlog", "-p", "I:/output"], + "args5": ["-gic1", "-d0", "-m1", "-o", "avlog", "-p", "J:/Downloads"], + "args6": ["-z", "-o", "J:/log"] + } + ] +} diff --git a/ADC_function.py b/ADC_function.py new file mode 100644 index 0000000..a5b39f5 --- /dev/null +++ b/ADC_function.py @@ -0,0 
+1,602 @@ +# build-in lib +import os.path +import os +import re +import uuid +import json +import time +import typing +from unicodedata import category +from concurrent.futures import ThreadPoolExecutor + +# third party lib +import requests +from requests.adapters import HTTPAdapter +import mechanicalsoup +from pathlib import Path +from urllib3.util.retry import Retry +from lxml import etree +from cloudscraper import create_scraper + +# project wide +import config + + +def get_xpath_single(html_code: str, xpath): + html = etree.fromstring(html_code, etree.HTMLParser()) + result1 = str(html.xpath(xpath)).strip(" ['']") + return result1 + + +G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.133 Safari/537.36' + + +def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None, json_headers=None): + """ + 网页请求核心函数 + """ + verify = config.getInstance().cacert_file() + config_proxy = config.getInstance().proxy() + errors = "" + + headers = {"User-Agent": ua or G_USER_AGENT} # noqa + if json_headers is not None: + headers.update(json_headers) + + for i in range(config_proxy.retry): + try: + if config_proxy.enable: + proxies = config_proxy.proxies() + result = requests.get(str(url), headers=headers, timeout=config_proxy.timeout, proxies=proxies, + verify=verify, + cookies=cookies) + else: + result = requests.get(str(url), headers=headers, timeout=config_proxy.timeout, cookies=cookies) + + if return_type == "object": + return result + elif return_type == "content": + return result.content + else: + result.encoding = encoding or result.apparent_encoding + return result.text + except Exception as e: + print("[-]Connect retry {}/{}".format(i + 1, config_proxy.retry)) + errors = str(e) + if "getaddrinfo failed" in errors: + print("[-]Connect Failed! Please Check your proxy config") + debug = config.getInstance().debug() + if debug: + print("[-]" + errors) + else: + print("[-]" + errors) + print('[-]Connect Failed! Please check your Proxy or Network!') + raise Exception('Connect Failed') + + +def post_html(url: str, query: dict, headers: dict = None) -> requests.Response: + config_proxy = config.getInstance().proxy() + errors = "" + headers_ua = {"User-Agent": G_USER_AGENT} + if headers is None: + headers = headers_ua + else: + headers.update(headers_ua) + + for i in range(config_proxy.retry): + try: + if config_proxy.enable: + proxies = config_proxy.proxies() + result = requests.post(url, data=query, proxies=proxies, headers=headers, timeout=config_proxy.timeout) + else: + result = requests.post(url, data=query, headers=headers, timeout=config_proxy.timeout) + return result + except Exception as e: + print("[-]Connect retry {}/{}".format(i + 1, config_proxy.retry)) + errors = str(e) + print("[-]Connect Failed! 
Please check your Proxy or Network!") + print("[-]" + errors) + + +G_DEFAULT_TIMEOUT = 10 # seconds + + +class TimeoutHTTPAdapter(HTTPAdapter): + def __init__(self, *args, **kwargs): + self.timeout = G_DEFAULT_TIMEOUT + if "timeout" in kwargs: + self.timeout = kwargs["timeout"] + del kwargs["timeout"] + super().__init__(*args, **kwargs) + + def send(self, request, **kwargs): + timeout = kwargs.get("timeout") + if timeout is None: + kwargs["timeout"] = self.timeout + return super().send(request, **kwargs) + + +# with keep-alive feature +def get_html_session(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None, + encoding: str = None): + config_proxy = config.getInstance().proxy() + session = requests.Session() + if isinstance(cookies, dict) and len(cookies): + requests.utils.add_dict_to_cookiejar(session.cookies, cookies) + retries = Retry(total=config_proxy.retry, connect=config_proxy.retry, backoff_factor=1, + status_forcelist=[429, 500, 502, 503, 504]) + session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout)) + session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout)) + if config_proxy.enable: + session.verify = config.getInstance().cacert_file() + session.proxies = config_proxy.proxies() + headers = {"User-Agent": ua or G_USER_AGENT} + session.headers = headers + try: + if isinstance(url, str) and len(url): + result = session.get(str(url)) + else: # 空url参数直接返回可重用session对象,无需设置return_type + return session + if not result.ok: + return None + if return_type == "object": + return result + elif return_type == "content": + return result.content + elif return_type == "session": + return result, session + else: + result.encoding = encoding or "utf-8" + return result.text + except requests.exceptions.ProxyError: + print("[-]get_html_session() Proxy error! Please check your Proxy") + except requests.exceptions.RequestException: + pass + except Exception as e: + print(f"[-]get_html_session() failed. {e}") + return None + + +def get_html_by_browser(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None, + encoding: str = None, use_scraper: bool = False): + config_proxy = config.getInstance().proxy() + s = create_scraper(browser={'custom': ua or G_USER_AGENT, }) if use_scraper else requests.Session() + if isinstance(cookies, dict) and len(cookies): + requests.utils.add_dict_to_cookiejar(s.cookies, cookies) + retries = Retry(total=config_proxy.retry, connect=config_proxy.retry, backoff_factor=1, + status_forcelist=[429, 500, 502, 503, 504]) + s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout)) + s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout)) + if config_proxy.enable: + s.verify = config.getInstance().cacert_file() + s.proxies = config_proxy.proxies() + try: + browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=s) + if isinstance(url, str) and len(url): + result = browser.open(url) + else: + return browser + if not result.ok: + return None + + if return_type == "object": + return result + elif return_type == "content": + return result.content + elif return_type == "browser": + return result, browser + else: + result.encoding = encoding or "utf-8" + return result.text + except requests.exceptions.ProxyError: + print("[-]get_html_by_browser() Proxy error! Please check your Proxy") + except Exception as e: + print(f'[-]get_html_by_browser() Failed! 
{e}') + return None + + +def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, + return_type: str = None, encoding: str = None): + config_proxy = config.getInstance().proxy() + s = requests.Session() + if isinstance(cookies, dict) and len(cookies): + requests.utils.add_dict_to_cookiejar(s.cookies, cookies) + retries = Retry(total=config_proxy.retry, connect=config_proxy.retry, backoff_factor=1, + status_forcelist=[429, 500, 502, 503, 504]) + s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout)) + s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout)) + if config_proxy.enable: + s.verify = config.getInstance().cacert_file() + s.proxies = config_proxy.proxies() + try: + browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=s) + result = browser.open(url) + if not result.ok: + return None + form = browser.select_form() if form_select is None else browser.select_form(form_select) + if isinstance(fields, dict): + for k, v in fields.items(): + browser[k] = v + response = browser.submit_selected() + + if return_type == "object": + return response + elif return_type == "content": + return response.content + elif return_type == "browser": + return response, browser + else: + result.encoding = encoding or "utf-8" + return response.text + except requests.exceptions.ProxyError: + print("[-]get_html_by_form() Proxy error! Please check your Proxy") + except Exception as e: + print(f'[-]get_html_by_form() Failed! {e}') + return None + + +def get_html_by_scraper(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None, + encoding: str = None): + config_proxy = config.getInstance().proxy() + session = create_scraper(browser={'custom': ua or G_USER_AGENT, }) + if isinstance(cookies, dict) and len(cookies): + requests.utils.add_dict_to_cookiejar(session.cookies, cookies) + retries = Retry(total=config_proxy.retry, connect=config_proxy.retry, backoff_factor=1, + status_forcelist=[429, 500, 502, 503, 504]) + session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout)) + session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout)) + if config_proxy.enable: + session.verify = config.getInstance().cacert_file() + session.proxies = config_proxy.proxies() + try: + if isinstance(url, str) and len(url): + result = session.get(str(url)) + else: # 空url参数直接返回可重用scraper对象,无需设置return_type + return session + if not result.ok: + return None + if return_type == "object": + return result + elif return_type == "content": + return result.content + elif return_type == "scraper": + return result, session + else: + result.encoding = encoding or "utf-8" + return result.text + except requests.exceptions.ProxyError: + print("[-]get_html_by_scraper() Proxy error! Please check your Proxy") + except Exception as e: + print(f"[-]get_html_by_scraper() failed. 
{e}") + return None + + +# def get_javlib_cookie() -> [dict, str]: +# import cloudscraper +# switch, proxy, timeout, retry_count, proxytype = config.getInstance().proxy() +# proxies = get_proxy(proxy, proxytype) +# +# raw_cookie = {} +# user_agent = "" +# +# # Get __cfduid/cf_clearance and user-agent +# for i in range(retry_count): +# try: +# if switch == 1 or switch == '1': +# raw_cookie, user_agent = cloudscraper.get_cookie_string( +# "http://www.javlibrary.com/", +# proxies=proxies +# ) +# else: +# raw_cookie, user_agent = cloudscraper.get_cookie_string( +# "http://www.javlibrary.com/" +# ) +# except requests.exceptions.ProxyError: +# print("[-] ProxyError, retry {}/{}".format(i + 1, retry_count)) +# except cloudscraper.exceptions.CloudflareIUAMError: +# print("[-] IUAMError, retry {}/{}".format(i + 1, retry_count)) +# +# return raw_cookie, user_agent + + +def translate( + src: str, + target_language: str = config.getInstance().get_target_language(), + engine: str = config.getInstance().get_translate_engine(), + app_id: str = "", + key: str = "", + delay: int = 0, +) -> str: + """ + translate japanese kana to simplified chinese + 翻译日语假名到简体中文 + :raises ValueError: Non-existent translation engine + """ + trans_result = "" + # 中文句子如果包含&等符号会被谷歌翻译截断损失内容,而且中文翻译到中文也没有意义,故而忽略,只翻译带有日语假名的 + if (is_japanese(src) == False) and ("zh_" in target_language): + return src + if engine == "google-free": + gsite = config.getInstance().get_translate_service_site() + if not re.match('^translate\.google\.(com|com\.\w{2}|\w{2})$', gsite): + gsite = 'translate.google.cn' + url = ( + f"https://{gsite}/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={target_language}&q={src}" + ) + result = get_html(url=url, return_type="object") + if not result.ok: + print('[-]Google-free translate web API calling failed.') + return '' + + translate_list = [i["trans"] for i in result.json()["sentences"]] + trans_result = trans_result.join(translate_list) + elif engine == "azure": + url = "https://api.cognitive.microsofttranslator.com/translate?api-version=3.0&to=" + target_language + headers = { + 'Ocp-Apim-Subscription-Key': key, + 'Ocp-Apim-Subscription-Region': "global", + 'Content-type': 'application/json', + 'X-ClientTraceId': str(uuid.uuid4()) + } + body = json.dumps([{'text': src}]) + result = post_html(url=url, query=body, headers=headers) + translate_list = [i["text"] for i in result.json()[0]["translations"]] + trans_result = trans_result.join(translate_list) + elif engine == "deeplx": + url = config.getInstance().get_translate_service_site() + res = requests.post(f"{url}/translate", json={ + 'text': src, + 'source_lang': 'auto', + 'target_lang': target_language, + }) + if res.text.strip(): + trans_result = res.json().get('data') + else: + raise ValueError("Non-existent translation engine") + + time.sleep(delay) + return trans_result + + +def load_cookies(cookie_json_filename: str) -> typing.Tuple[typing.Optional[dict], typing.Optional[str]]: + """ + 加载cookie,用于以会员方式访问非游客内容 + + :filename: cookie文件名。获取cookie方式:从网站登录后,通过浏览器插件(CookieBro或EdittThisCookie)或者直接在地址栏网站链接信息处都可以复制或者导出cookie内容,以JSON方式保存 + + # 示例: FC2-755670 url https://javdb9.com/v/vO8Mn + # json 文件格式 + # 文件名: 站点名.json,示例 javdb9.json + # 内容(文件编码:UTF-8): + { + "over18":"1", + "redirect_to":"%2Fv%2FvO8Mn", + "remember_me_token":"***********", + "_jdb_session":"************", + "locale":"zh", + "__cfduid":"*********", + "theme":"auto" + } + """ + filename = os.path.basename(cookie_json_filename) + if not len(filename): + return None, None + 
path_search_order = ( + Path.cwd() / filename, + Path.home() / filename, + Path.home() / f".mdc/{filename}", + Path.home() / f".local/share/mdc/{filename}" + ) + cookies_filename = None + try: + for p in path_search_order: + if p.is_file(): + cookies_filename = str(p.resolve()) + break + if not cookies_filename: + return None, None + return json.loads(Path(cookies_filename).read_text(encoding='utf-8')), cookies_filename + except: + return None, None + + +def file_modification_days(filename: str) -> int: + """ + 文件修改时间距此时的天数 + """ + mfile = Path(filename) + if not mfile.is_file(): + return 9999 + mtime = int(mfile.stat().st_mtime) + now = int(time.time()) + days = int((now - mtime) / (24 * 60 * 60)) + if days < 0: + return 9999 + return days + + +def file_not_exist_or_empty(filepath) -> bool: + return not os.path.isfile(filepath) or os.path.getsize(filepath) == 0 + + +def is_japanese(raw: str) -> bool: + """ + 日语简单检测 + """ + return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', raw, re.UNICODE)) + + +def download_file_with_filename(url: str, filename: str, path: str) -> None: + """ + download file save to give path with given name from given url + """ + conf = config.getInstance() + config_proxy = conf.proxy() + + for i in range(config_proxy.retry): + try: + if config_proxy.enable: + if not os.path.exists(path): + try: + os.makedirs(path) + except: + print(f"[-]Fatal error! Can not make folder '{path}'") + os._exit(0) + r = get_html(url=url, return_type='content') + if r == '': + print('[-]Movie Download Data not found!') + return + with open(os.path.join(path, filename), "wb") as code: + code.write(r) + return + else: + if not os.path.exists(path): + try: + os.makedirs(path) + except: + print(f"[-]Fatal error! Can not make folder '{path}'") + os._exit(0) + r = get_html(url=url, return_type='content') + if r == '': + print('[-]Movie Download Data not found!') + return + with open(os.path.join(path, filename), "wb") as code: + code.write(r) + return + except requests.exceptions.ProxyError: + i += 1 + print('[-]Download : Connect retry ' + str(i) + '/' + str(config_proxy.retry)) + except requests.exceptions.ConnectTimeout: + i += 1 + print('[-]Download : Connect retry ' + str(i) + '/' + str(config_proxy.retry)) + except requests.exceptions.ConnectionError: + i += 1 + print('[-]Download : Connect retry ' + str(i) + '/' + str(config_proxy.retry)) + except requests.exceptions.RequestException: + i += 1 + print('[-]Download : Connect retry ' + str(i) + '/' + str(config_proxy.retry)) + except IOError: + raise ValueError(f"[-]Create Directory '{path}' failed!") + return + print('[-]Connect Failed! Please check your Proxy or Network!') + raise ValueError('[-]Connect Failed! 
Please check your Proxy or Network!') + return + + +def download_one_file(args) -> str: + """ + download file save to given path from given url + wrapped for map function + """ + + (url, save_path, json_headers) = args + if json_headers is not None: + filebytes = get_html(url, return_type='content', json_headers=json_headers['headers']) + else: + filebytes = get_html(url, return_type='content') + if isinstance(filebytes, bytes) and len(filebytes): + with save_path.open('wb') as fpbyte: + if len(filebytes) == fpbyte.write(filebytes): + return str(save_path) + + +def parallel_download_files(dn_list: typing.Iterable[typing.Sequence], parallel: int = 0, json_headers=None): + """ + download files in parallel 多线程下载文件 + + 用法示例: 2线程同时下载两个不同文件,并保存到不同路径,路径目录可未创建,但需要具备对目标目录和文件的写权限 + parallel_download_files([ + ('https://site1/img/p1.jpg', 'C:/temp/img/p1.jpg'), + ('https://site2/cover/n1.xml', 'C:/tmp/cover/n1.xml') + ]) + + :dn_list: 可以是 tuple或者list: ((url1, save_fullpath1),(url2, save_fullpath2),) fullpath可以是str或Path + :parallel: 并行下载的线程池线程数,为0则由函数自己决定 + """ + mp_args = [] + for url, fullpath in dn_list: + if url and isinstance(url, str) and url.startswith('http') \ + and fullpath and isinstance(fullpath, (str, Path)) and len(str(fullpath)): + fullpath = Path(fullpath) + fullpath.parent.mkdir(parents=True, exist_ok=True) + mp_args.append((url, fullpath, json_headers)) + if not len(mp_args): + return [] + if not isinstance(parallel, int) or parallel not in range(1, 200): + parallel = min(5, len(mp_args)) + with ThreadPoolExecutor(parallel) as pool: + results = list(pool.map(download_one_file, mp_args)) + return results + + +def delete_all_elements_in_list(string: str, lists: typing.Iterable[str]): + """ + delete same string in given list + """ + new_lists = [] + for i in lists: + if i != string: + new_lists.append(i) + return new_lists + + +def delete_all_elements_in_str(string_delete: str, string: str): + """ + delete same string in given list + """ + for i in string: + if i == string_delete: + string = string.replace(i, "") + return string + + +# print format空格填充对齐内容包含中文时的空格计算 +def cn_space(v: str, n: int) -> int: + return n - [category(c) for c in v].count('Lo') + + +""" +Usage: python ./ADC_function.py https://cn.bing.com/ +Purpose: benchmark get_html_session + benchmark get_html_by_scraper + benchmark get_html_by_browser + benchmark get_html +TODO: may be this should move to unittest directory +""" +if __name__ == "__main__": + import sys, timeit + from http.client import HTTPConnection + + + def benchmark(times: int, url): + print(f"HTTP GET Benchmark times:{times} url:{url}") + tm = timeit.timeit(f"_ = session1.get('{url}')", + "from __main__ import get_html_session;session1=get_html_session()", + number=times) + print(f' *{tm:>10.5f}s get_html_session() Keep-Alive enable') + tm = timeit.timeit(f"_ = scraper1.get('{url}')", + "from __main__ import get_html_by_scraper;scraper1=get_html_by_scraper()", + number=times) + print(f' *{tm:>10.5f}s get_html_by_scraper() Keep-Alive enable') + tm = timeit.timeit(f"_ = browser1.open('{url}')", + "from __main__ import get_html_by_browser;browser1=get_html_by_browser()", + number=times) + print(f' *{tm:>10.5f}s get_html_by_browser() Keep-Alive enable') + tm = timeit.timeit(f"_ = get_html('{url}')", + "from __main__ import get_html", + number=times) + print(f' *{tm:>10.5f}s get_html()') + + + # target_url = "https://www.189.cn/" + target_url = "http://www.chinaunicom.com" + HTTPConnection.debuglevel = 1 + html_session = get_html_session() + _ = 
html_session.get(target_url)
+    HTTPConnection.debuglevel = 0
+
+    # number of benchmark iterations
+    t = 100
+    if len(sys.argv) > 1:
+        target_url = sys.argv[1]
+    benchmark(t, target_url)
diff --git a/ImageProcessing/__init__.py b/ImageProcessing/__init__.py
new file mode 100644
index 0000000..768b1b4
--- /dev/null
+++ b/ImageProcessing/__init__.py
@@ -0,0 +1,114 @@
+import sys
+sys.path.append('../')
+
+import logging
+import os
+import config
+import importlib
+from pathlib import Path
+from PIL import Image
+import shutil
+from ADC_function import file_not_exist_or_empty
+
+
+def face_crop_width(filename, width, height):
+    aspect_ratio = config.getInstance().face_aspect_ratio()
+    # the new width is 2/3 of the height
+    cropWidthHalf = int(height/3)
+    try:
+        locations_model = config.getInstance().face_locations_model().lower().split(',')
+        locations_model = filter(lambda x: x, locations_model)
+        for model in locations_model:
+            center, top = face_center(filename, model)
+            # stop at the first model that finds a face
+            if center:
+                cropLeft = center-cropWidthHalf
+                cropRight = center+cropWidthHalf
+                # clamp the crop box to the image bounds
+                if cropLeft < 0:
+                    cropLeft = 0
+                    cropRight = cropWidthHalf * aspect_ratio
+                elif cropRight > width:
+                    cropLeft = width - cropWidthHalf * aspect_ratio
+                    cropRight = width
+                return (cropLeft, 0, cropRight, height)
+    except:
+        print('[-]No face found! ' + filename)
+    # default: crop from the right edge
+    return (width-cropWidthHalf * aspect_ratio, 0, width, height)
+
+
+def face_crop_height(filename, width, height):
+    cropHeight = int(width*3/2)
+    try:
+        locations_model = config.getInstance().face_locations_model().lower().split(',')
+        locations_model = filter(lambda x: x, locations_model)
+        for model in locations_model:
+            center, top = face_center(filename, model)
+            # stop at the first model that finds a face
+            if top:
+                # keep the head near the top of the crop
+                cropTop = top
+                cropBottom = cropHeight + top
+                if cropBottom > height:
+                    cropTop = 0
+                    cropBottom = cropHeight
+                return (0, cropTop, width, cropBottom)
+    except:
+        print('[-]No face found! ' + filename)
+    # default: crop from the top down
+    return (0, 0, width, cropHeight)
+
+
+def cutImage(imagecut, path, thumb_path, poster_path, skip_facerec=False):
+    conf = config.getInstance()
+    fullpath_fanart = os.path.join(path, thumb_path)
+    fullpath_poster = os.path.join(path, poster_path)
+    aspect_ratio = conf.face_aspect_ratio()
+    if conf.face_aways_imagecut():
+        imagecut = 1
+    elif conf.download_only_missing_images() and not file_not_exist_or_empty(fullpath_poster):
+        return
+    # imagecut == 4 also marks a censored movie; its cover is cropped with face detection as well
+    if imagecut == 1 or imagecut == 4:  # crop the large cover
+        try:
+            img = Image.open(fullpath_fanart)
+            width, height = img.size
+            if width/height > 2/3:  # wider than the 2:3 poster ratio
+                if imagecut == 4:
+                    # crop centered on the detected face
+                    img2 = img.crop(face_crop_width(fullpath_fanart, width, height))
+                elif skip_facerec:
+                    # censored covers default to cropping from the right edge
+                    img2 = img.crop((width - int(height / 3) * aspect_ratio, 0, width, height))
+                else:
+                    # crop centered on the detected face
+                    img2 = img.crop(face_crop_width(fullpath_fanart, width, height))
+            elif width/height < 2/3:  # taller than the 2:3 poster ratio
+                # crop from the bottom up
+                img2 = img.crop(face_crop_height(fullpath_fanart, width, height))
+            else:  # already exactly 2:3
+                img2 = img
+            img2.save(fullpath_poster)
+            print(f"[+]Image cut! {Path(fullpath_poster).name}")
+        except Exception as e:
+            print(e)
+            print('[-]Cover cut failed!')
+    elif imagecut == 0:  # copy the cover unchanged
+        shutil.copyfile(fullpath_fanart, fullpath_poster)
+        print(f"[+]Image copied! {Path(fullpath_poster).name}")
+
+
+def face_center(filename, model):
+    try:
+        mod = importlib.import_module('.'
+ model, 'ImageProcessing') + return mod.face_center(filename, model) + except Exception as e: + print('[-]Model found face ' + filename) + if config.getInstance().debug() == 1: + logging.error(e) + return (0, 0) + +if __name__ == '__main__': + cutImage(1,'z:/t/','p.jpg','o.jpg') + #cutImage(1,'H:\\test\\','12.jpg','test.jpg') diff --git a/ImageProcessing/cnn.py b/ImageProcessing/cnn.py new file mode 100644 index 0000000..2d190ed --- /dev/null +++ b/ImageProcessing/cnn.py @@ -0,0 +1,8 @@ +import sys +sys.path.append('../') + +from ImageProcessing.hog import face_center as hog_face_center + + +def face_center(filename, model): + return hog_face_center(filename, model) diff --git a/ImageProcessing/hog.py b/ImageProcessing/hog.py new file mode 100644 index 0000000..3a6ca49 --- /dev/null +++ b/ImageProcessing/hog.py @@ -0,0 +1,17 @@ +import face_recognition + + +def face_center(filename, model): + image = face_recognition.load_image_file(filename) + face_locations = face_recognition.face_locations(image, 1, model) + print('[+]Found person [' + str(len(face_locations)) + '] By model hog') + maxRight = 0 + maxTop = 0 + for face_location in face_locations: + top, right, bottom, left = face_location + # 中心点 + x = int((right+left)/2) + if x > maxRight: + maxRight = x + maxTop = top + return maxRight,maxTop diff --git a/Img/4K.png b/Img/4K.png new file mode 100644 index 0000000..d4b67da Binary files /dev/null and b/Img/4K.png differ diff --git a/Img/ISO.png b/Img/ISO.png new file mode 100644 index 0000000..e1a1aec Binary files /dev/null and b/Img/ISO.png differ diff --git a/Img/SUB.png b/Img/SUB.png new file mode 100644 index 0000000..f4af58d Binary files /dev/null and b/Img/SUB.png differ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f288702 --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. 
+ + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. 
+ + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. 
You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. 
In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year>  <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program>  Copyright (C) <year>  <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/licenses/why-not-lgpl.html>. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..1b70b22 --- /dev/null +++ b/Makefile @@ -0,0 +1,38 @@ +#.PHONY: help prepare-dev test lint run doc + +#VENV_NAME?=venv +#VENV_ACTIVATE=. 
$(VENV_NAME)/bin/activate +#PYTHON=${VENV_NAME}/bin/python3 +SHELL = /bin/bash + +.DEFAULT: make +make: + @echo "[+]make prepare-dev" + #sudo apt-get -y install python3 python3-pip + pip3 install -r requirements.txt + pip3 install pyinstaller + + #@echo "[+]Set CLOUDSCRAPER_PATH variable" + #export cloudscraper_path=$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1) + + @echo "[+]Pyinstaller make" + pyinstaller --onefile Movie_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \ + --hidden-import "ImageProcessing.cnn" \ + --python-option u \ + --add-data "`python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1`:cloudscraper" \ + --add-data "`python3 -c 'import opencc as _; print(_.__path__[0])' | tail -n 1`:opencc" \ + --add-data "`python3 -c 'import face_recognition_models as _; print(_.__path__[0])' | tail -n 1`:face_recognition_models" \ + --add-data "Img:Img" \ + --add-data "config.ini:." \ + + @echo "[+]Move to bin" + if [ ! -d "./bin" ];then mkdir bin; fi + mv dist/* bin/ + cp config.ini bin/ + rm -rf dist/ + + @echo "[+]Clean cache" + @find . -name '*.pyc' -delete + @find . -name '__pycache__' -type d | xargs rm -fr + @find . -name '.pytest_cache' -type d | xargs rm -fr + rm -rf build/ diff --git a/Movie_Data_Capture.py b/Movie_Data_Capture.py new file mode 100644 index 0000000..d9f0b0a --- /dev/null +++ b/Movie_Data_Capture.py @@ -0,0 +1,677 @@ +import argparse +import json +import os +import random +import re +import sys +import time +import shutil +import typing +import urllib3 +import signal +import platform +import config + +from datetime import datetime, timedelta +from lxml import etree +from pathlib import Path +from opencc import OpenCC + +from scraper import get_data_from_json +from ADC_function import file_modification_days, get_html, parallel_download_files +from number_parser import get_number +from core import core_main, core_main_no_net_op, moveFailedFolder, debug_print + +def argparse_function(ver: str) -> typing.Tuple[str, str, str, str, bool, bool, str, str]: + conf = config.getInstance() + parser = argparse.ArgumentParser(epilog=f"Load Config file '{conf.ini_path}'.") + parser.add_argument("file", default='', nargs='?', help="Single Movie file path.") + parser.add_argument("-p", "--path", default='', nargs='?', help="Analysis folder path.") + parser.add_argument("-m", "--main-mode", default='', nargs='?', + help="Main mode. 1:Scraping 2:Organizing 3:Scraping in analysis folder") + parser.add_argument("-n", "--number", default='', nargs='?', help="Custom file number of single movie file.") + # parser.add_argument("-C", "--config", default='config.ini', nargs='?', help="The config file Path.") + parser.add_argument("-L", "--link-mode", default='', nargs='?', + help="Create movie file link. 0:moving movie file, do not create link 1:soft link 2:try hard link first") + default_logdir = str(Path.home() / '.mlogs') + parser.add_argument("-o", "--log-dir", dest='logdir', default=default_logdir, nargs='?', + help=f"""Duplicate stdout and stderr to logfiles in logging folder, default on. + default folder for current user: '{default_logdir}'. 
To turn logging off, replace the default folder with an empty file of the same name, + or use --log-dir=.""") + parser.add_argument("-q", "--regex-query", dest='regexstr', default='', nargs='?', + help="Filter file paths with a Python 're' module regex.") + parser.add_argument("-d", "--nfo-skip-days", dest='days', default='', nargs='?', + help="Override nfo_skip_days value in config.") + parser.add_argument("-c", "--stop-counter", dest='cnt', default='', nargs='?', + help="Override stop_counter value in config.") + parser.add_argument("-R", "--rerun-delay", dest='delaytm', default='', nargs='?', + help="Delay (e.g. 1h10m30s, or plain 60 for seconds) and rerun until all movies are processed. Note: stop_counter in config or -c must be non-zero.") + parser.add_argument("-i", "--ignore-failed-list", action="store_true", help="Ignore failed list '{}'".format( + os.path.join(os.path.abspath(conf.failed_folder()), 'failed_list.txt'))) + parser.add_argument("-a", "--auto-exit", action="store_true", + help="Exit automatically after the program completes.") + parser.add_argument("-g", "--debug", action="store_true", + help="Turn on debug mode to generate a diagnostic log for issue reports.") + parser.add_argument("-N", "--no-network-operation", action="store_true", + help="No network queries; do not fetch metadata (for cover cropping only). Only takes effect when main mode is 3.") + parser.add_argument("-w", "--website", dest='site', default='', nargs='?', + help="Override [priority]website= in config.") + parser.add_argument("-D", "--download-images", dest='dnimg', action="store_true", + help="Override [common]download_only_missing_images=0 to force image downloading.") + parser.add_argument("-C", "--config-override", dest='cfgcmd', action='append', nargs=1, + help="Common config override. Grammar: section:key=value[;[section:]key=value] e.g. 'de:s=1' or 'debug_mode:switch=1' overrides [debug_mode]switch=1. Note: this parameter can be used multiple times.") + parser.add_argument("-z", "--zero-operation", dest='zero_op', action="store_true", + help="""Only show the job list of files and numbers, and **NO** actual operation +is performed. 
It may help you correct wrong numbers before real job.""") + parser.add_argument("-v", "--version", action="version", version=ver) + parser.add_argument("-s", "--search", default='', nargs='?', help="Search number") + parser.add_argument("-ss", "--specified-source", default='', nargs='?', help="specified Source.") + parser.add_argument("-su", "--specified-url", default='', nargs='?', help="specified Url.") + + args = parser.parse_args() + + def set_natural_number_or_none(sk, value): + if isinstance(value, str) and value.isnumeric() and int(value) >= 0: + conf.set_override(f'{sk}={value}') + + def set_str_or_none(sk, value): + if isinstance(value, str) and len(value): + conf.set_override(f'{sk}={value}') + + def set_bool_or_none(sk, value): + if isinstance(value, bool) and value: + conf.set_override(f'{sk}=1') + + set_natural_number_or_none("common:main_mode", args.main_mode) + set_natural_number_or_none("common:link_mode", args.link_mode) + set_str_or_none("common:source_folder", args.path) + set_bool_or_none("common:auto_exit", args.auto_exit) + set_natural_number_or_none("common:nfo_skip_days", args.days) + set_natural_number_or_none("advenced_sleep:stop_counter", args.cnt) + set_bool_or_none("common:ignore_failed_list", args.ignore_failed_list) + set_str_or_none("advenced_sleep:rerun_delay", args.delaytm) + set_str_or_none("priority:website", args.site) + if isinstance(args.dnimg, bool) and args.dnimg: + conf.set_override("common:download_only_missing_images=0") + set_bool_or_none("debug_mode:switch", args.debug) + if isinstance(args.cfgcmd, list): + for cmd in args.cfgcmd: + conf.set_override(cmd[0]) + + no_net_op = False + if conf.main_mode() == 3: + no_net_op = args.no_network_operation + if no_net_op: + conf.set_override("advenced_sleep:stop_counter=0;advenced_sleep:rerun_delay=0s;face:aways_imagecut=1") + + return args.file, args.number, args.logdir, args.regexstr, args.zero_op, no_net_op, args.search, args.specified_source, args.specified_url + + +class OutLogger(object): + def __init__(self, logfile) -> None: + self.term = sys.stdout + self.log = open(logfile, "w", encoding='utf-8', buffering=1) + self.filepath = logfile + + def __del__(self): + self.close() + + def __enter__(self): + pass + + def __exit__(self, *args): + self.close() + + def write(self, msg): + self.term.write(msg) + self.log.write(msg) + + def flush(self): + if 'flush' in dir(self.term): + self.term.flush() + if 'flush' in dir(self.log): + self.log.flush() + if 'fileno' in dir(self.log): + os.fsync(self.log.fileno()) + + def close(self): + if self.term is not None: + sys.stdout = self.term + self.term = None + if self.log is not None: + self.log.close() + self.log = None + + +class ErrLogger(OutLogger): + + def __init__(self, logfile) -> None: + self.term = sys.stderr + self.log = open(logfile, "w", encoding='utf-8', buffering=1) + self.filepath = logfile + + def close(self): + if self.term is not None: + sys.stderr = self.term + self.term = None + + if self.log is not None: + self.log.close() + self.log = None + + +def dupe_stdout_to_logfile(logdir: str): + if not isinstance(logdir, str) or len(logdir) == 0: + return + log_dir = Path(logdir) + if not log_dir.exists(): + try: + log_dir.mkdir(parents=True, exist_ok=True) + except: + pass + if not log_dir.is_dir(): + return # Tips for disabling logs by change directory to a same name empty regular file + abslog_dir = log_dir.resolve() + log_tmstr = datetime.now().strftime("%Y%m%dT%H%M%S") + logfile = abslog_dir / f'mdc_{log_tmstr}.txt' + errlog = abslog_dir / 
f'mdc_{log_tmstr}_err.txt' + + sys.stdout = OutLogger(logfile) + sys.stderr = ErrLogger(errlog) + + +def close_logfile(logdir: str): + if not isinstance(logdir, str) or len(logdir) == 0 or not os.path.isdir(logdir): + return + # 日志关闭前保存日志路径 + filepath = None + try: + filepath = sys.stdout.filepath + except: + pass + sys.stdout.close() + sys.stderr.close() + log_dir = Path(logdir).resolve() + if isinstance(filepath, Path): + print(f"Log file '{filepath}' saved.") + assert (filepath.parent.samefile(log_dir)) + # 清理空文件 + for f in log_dir.glob(r'*_err.txt'): + if f.stat().st_size == 0: + try: + f.unlink(missing_ok=True) + except: + pass + # 合并日志 只检测日志目录内的文本日志,忽略子目录。三天前的日志,按日合并为单个日志,三个月前的日志, + # 按月合并为单个月志,去年及以前的月志,今年4月以后将之按年合并为年志 + # 测试步骤: + """ + LOGDIR=/tmp/mlog + mkdir -p $LOGDIR + for f in {2016..2020}{01..12}{01..28};do;echo $f>$LOGDIR/mdc_${f}T235959.txt;done + for f in {01..09}{01..28};do;echo 2021$f>$LOGDIR/mdc_2021${f}T235959.txt;done + for f in {00..23};do;echo 20211001T$f>$LOGDIR/mdc_20211001T${f}5959.txt;done + echo "$(ls -1 $LOGDIR|wc -l) files in $LOGDIR" + # 1932 files in /tmp/mlog + mdc -zgic1 -d0 -m3 -o $LOGDIR + # python3 ./Movie_Data_Capture.py -zgic1 -o $LOGDIR + ls $LOGDIR + # rm -rf $LOGDIR + """ + today = datetime.today() + # 第一步,合并到日。3天前的日志,文件名是同一天的合并为一份日志 + for i in range(1): + txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'^mdc_\d{8}T\d{6}$', f.stem, re.A)] + if not txts or not len(txts): + break + e = [f for f in txts if '_err' in f.stem] + txts.sort() + tmstr_3_days_ago = (today.replace(hour=0) - timedelta(days=3)).strftime("%Y%m%dT99") + deadline_day = f'mdc_{tmstr_3_days_ago}' + day_merge = [f for f in txts if f.stem < deadline_day] + if not day_merge or not len(day_merge): + break + cutday = len('T235959.txt') # cut length mdc_20201201|T235959.txt + for f in day_merge: + try: + day_file_name = str(f)[:-cutday] + '.txt' # mdc_20201201.txt + with open(day_file_name, 'a', encoding='utf-8') as m: + m.write(f.read_text(encoding='utf-8')) + f.unlink(missing_ok=True) + except: + pass + # 第二步,合并到月 + for i in range(1): # 利用1次循环的break跳到第二步,避免大块if缩进或者使用goto语法 + txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'^mdc_\d{8}$', f.stem, re.A)] + if not txts or not len(txts): + break + txts.sort() + tmstr_3_month_ago = (today.replace(day=1) - timedelta(days=3 * 30)).strftime("%Y%m32") + deadline_month = f'mdc_{tmstr_3_month_ago}' + month_merge = [f for f in txts if f.stem < deadline_month] + if not month_merge or not len(month_merge): + break + tomonth = len('01.txt') # cut length mdc_202012|01.txt + for f in month_merge: + try: + month_file_name = str(f)[:-tomonth] + '.txt' # mdc_202012.txt + with open(month_file_name, 'a', encoding='utf-8') as m: + m.write(f.read_text(encoding='utf-8')) + f.unlink(missing_ok=True) + except: + pass + # 第三步,月合并到年 + for i in range(1): + if today.month < 4: + break + mons = [f for f in log_dir.glob(r'*.txt') if re.match(r'^mdc_\d{6}$', f.stem, re.A)] + if not mons or not len(mons): + break + mons.sort() + deadline_year = f'mdc_{today.year - 1}13' + year_merge = [f for f in mons if f.stem < deadline_year] + if not year_merge or not len(year_merge): + break + toyear = len('12.txt') # cut length mdc_2020|12.txt + for f in year_merge: + try: + year_file_name = str(f)[:-toyear] + '.txt' # mdc_2020.txt + with open(year_file_name, 'a', encoding='utf-8') as y: + y.write(f.read_text(encoding='utf-8')) + f.unlink(missing_ok=True) + except: + pass + # 第四步,压缩年志 如果有压缩需求,请自行手工压缩,或者使用外部脚本来定时完成。推荐nongnu的lzip,对于 + # 这种粒度的文本日志,压缩比是目前最好的。lzip 
-9的运行参数下,日志压缩比要高于xz -9,而且内存占用更少, + # 多核利用率更高(plzip多线程版本),解压速度更快。压缩后的大小差不多是未压缩时的2.4%到3.7%左右, + # 100MB的日志文件能缩小到3.7MB。 + return filepath + + +def signal_handler(*args): + print('[!]Ctrl+C detected, Exit.') + os._exit(9) + + +def sigdebug_handler(*args): + conf = config.getInstance() + conf.set_override(f"debug_mode:switch={int(not conf.debug())}") + print(f"[!]Debug {('oFF', 'On')[int(conf.debug())]}") + + +# 新增失败文件列表跳过处理,及.nfo修改天数跳过处理,提示跳过视频总数,调试模式(-g)下详细被跳过文件,跳过小广告 +def movie_lists(source_folder, regexstr: str) -> typing.List[str]: + conf = config.getInstance() + main_mode = conf.main_mode() + debug = conf.debug() + nfo_skip_days = conf.nfo_skip_days() + link_mode = conf.link_mode() + file_type = conf.media_type().lower().split(",") + trailerRE = re.compile(r'-trailer\.', re.IGNORECASE) + cliRE = None + if isinstance(regexstr, str) and len(regexstr): + try: + cliRE = re.compile(regexstr, re.IGNORECASE) + except: + pass + failed_list_txt_path = Path(conf.failed_folder()).resolve() / 'failed_list.txt' + failed_set = set() + if (main_mode == 3 or link_mode) and not conf.ignore_failed_list(): + try: + flist = failed_list_txt_path.read_text(encoding='utf-8').splitlines() + failed_set = set(flist) + if len(flist) != len(failed_set): # 检查去重并写回,但是不改变failed_list.txt内条目的先后次序,重复的只保留最后的 + fset = failed_set.copy() + for i in range(len(flist) - 1, -1, -1): + fset.remove(flist[i]) if flist[i] in fset else flist.pop(i) + failed_list_txt_path.write_text('\n'.join(flist) + '\n', encoding='utf-8') + assert len(fset) == 0 and len(flist) == len(failed_set) + except: + pass + if not Path(source_folder).is_dir(): + print('[-]Source folder not found!') + return [] + total = [] + source = Path(source_folder).resolve() + skip_failed_cnt, skip_nfo_days_cnt = 0, 0 + escape_folder_set = set(re.split("[,,]", conf.escape_folder())) + for full_name in source.glob(r'**/*'): + if main_mode != 3 and set(full_name.parent.parts) & escape_folder_set: + continue + if not full_name.is_file(): + continue + if not full_name.suffix.lower() in file_type: + continue + absf = str(full_name) + if absf in failed_set: + skip_failed_cnt += 1 + if debug: + print('[!]Skip failed movie:', absf) + continue + is_sym = full_name.is_symlink() + if main_mode != 3 and (is_sym or (full_name.stat().st_nlink > 1 and not conf.scan_hardlink())): # 短路布尔 符号链接不取stat(),因为符号链接可能指向不存在目标 + continue # 模式不等于3下跳过软连接和未配置硬链接刮削 + # 调试用0字节样本允许通过,去除小于120MB的广告'苍老师强力推荐.mp4'(102.2MB)'黑道总裁.mp4'(98.4MB)'有趣的妹子激情表演.MP4'(95MB)'有趣的臺灣妹妹直播.mp4'(15.1MB) + movie_size = 0 if is_sym else full_name.stat().st_size # 同上 符号链接不取stat()及st_size,直接赋0跳过小视频检测 + # if 0 < movie_size < 125829120: # 1024*1024*120=125829120 + # continue + if cliRE and not cliRE.search(absf) or trailerRE.search(full_name.name): + continue + if main_mode == 3: + nfo = full_name.with_suffix('.nfo') + if not nfo.is_file(): + if debug: + print(f"[!]Metadata {nfo.name} not found for '{absf}'") + elif nfo_skip_days > 0 and file_modification_days(nfo) <= nfo_skip_days: + skip_nfo_days_cnt += 1 + if debug: + print(f"[!]Skip movie by it's .nfo which modified within {nfo_skip_days} days: '{absf}'") + continue + total.append(absf) + + if skip_failed_cnt: + print(f"[!]Skip {skip_failed_cnt} movies in failed list '{failed_list_txt_path}'.") + if skip_nfo_days_cnt: + print( + f"[!]Skip {skip_nfo_days_cnt} movies in source folder '{source}' who's .nfo modified within {nfo_skip_days} days.") + if nfo_skip_days <= 0 or not link_mode or main_mode == 3: + return total + # 软连接方式,已经成功削刮的也需要从成功目录中检查.nfo更新天数,跳过N天内更新过的 + skip_numbers = 
set() + success_folder = Path(conf.success_folder()).resolve() + for f in success_folder.glob(r'**/*'): + if not re.match(r'\.nfo$', f.suffix, re.IGNORECASE): + continue + if file_modification_days(f) > nfo_skip_days: + continue + number = get_number(False, f.stem) + if not number: + continue + skip_numbers.add(number.lower()) + + rm_list = [] + for f in total: + n_number = get_number(False, os.path.basename(f)) + if n_number and n_number.lower() in skip_numbers: + rm_list.append(f) + for f in rm_list: + total.remove(f) + if debug: + print(f"[!]Skip file successfully processed within {nfo_skip_days} days: '{f}'") + if len(rm_list): + print( + f"[!]Skip {len(rm_list)} movies in success folder '{success_folder}' who's .nfo modified within {nfo_skip_days} days.") + + return total + + +def create_failed_folder(failed_folder: str): + """ + 新建failed文件夹 + """ + if not os.path.exists(failed_folder): + try: + os.makedirs(failed_folder) + except: + print(f"[-]Fatal error! Can not make folder '{failed_folder}'") + os._exit(0) + + +def rm_empty_folder(path): + abspath = os.path.abspath(path) + deleted = set() + for current_dir, subdirs, files in os.walk(abspath, topdown=False): + try: + still_has_subdirs = any(_ for subdir in subdirs if os.path.join(current_dir, subdir) not in deleted) + if not any(files) and not still_has_subdirs and not os.path.samefile(path, current_dir): + os.rmdir(current_dir) + deleted.add(current_dir) + print('[+]Deleting empty folder', current_dir) + except: + pass + + +def create_data_and_move(movie_path: str, zero_op: bool, no_net_op: bool, oCC): + # Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4 + debug = config.getInstance().debug() + n_number = get_number(debug, os.path.basename(movie_path)) + movie_path = os.path.abspath(movie_path) + + if debug is True: + print(f"[!] [{n_number}] As Number Processing for '{movie_path}'") + if zero_op: + return + if n_number: + if no_net_op: + core_main_no_net_op(movie_path, n_number) + else: + core_main(movie_path, n_number, oCC) + else: + print("[-] number empty ERROR") + moveFailedFolder(movie_path) + print("[*]======================================================") + else: + try: + print(f"[!] [{n_number}] As Number Processing for '{movie_path}'") + if zero_op: + return + if n_number: + if no_net_op: + core_main_no_net_op(movie_path, n_number) + else: + core_main(movie_path, n_number, oCC) + else: + raise ValueError("number empty") + print("[*]======================================================") + except Exception as err: + print(f"[-] [{movie_path}] ERROR:") + print('[-]', err) + + try: + moveFailedFolder(movie_path) + except Exception as err: + print('[!]', err) + + +def create_data_and_move_with_custom_number(file_path: str, custom_number, oCC, specified_source, specified_url): + conf = config.getInstance() + file_name = os.path.basename(file_path) + try: + print("[!] 
[{1}] As Number Processing for '{0}'".format(file_path, custom_number)) + if custom_number: + core_main(file_path, custom_number, oCC, specified_source, specified_url) + else: + print("[-] number empty ERROR") + print("[*]======================================================") + except Exception as err: + print("[-] [{}] ERROR:".format(file_path)) + print('[-]', err) + + if conf.link_mode(): + print("[-]Link {} to failed folder".format(file_path)) + os.symlink(file_path, os.path.join(conf.failed_folder(), file_name)) + else: + try: + print("[-]Move [{}] to failed folder".format(file_path)) + shutil.move(file_path, os.path.join(conf.failed_folder(), file_name)) + except Exception as err: + print('[!]', err) + + +def main(args: tuple) -> Path: + (single_file_path, custom_number, logdir, regexstr, zero_op, no_net_op, search, specified_source, + specified_url) = args + conf = config.getInstance() + main_mode = conf.main_mode() + folder_path = "" + if main_mode not in (1, 2, 3): + print(f"[-]Main mode must be 1 or 2 or 3! You can run '{os.path.basename(sys.argv[0])} --help' for more help.") + os._exit(4) + + signal.signal(signal.SIGINT, signal_handler) + if sys.platform == 'win32': + signal.signal(signal.SIGBREAK, sigdebug_handler) + else: + signal.signal(signal.SIGWINCH, sigdebug_handler) + dupe_stdout_to_logfile(logdir) + + platform_total = str( + ' - ' + platform.platform() + ' \n[*] - ' + platform.machine() + ' - Python-' + platform.python_version()) + + print('[*]================= Movie Data Capture =================') + print('[*]' + version.center(54)) + print('[*]======================================================') + print('[*]' + platform_total) + print('[*]======================================================') + + start_time = time.time() + print('[+]Start at', time.strftime("%Y-%m-%d %H:%M:%S")) + + print(f"[+]Load Config file '{conf.ini_path}'.") + if conf.debug(): + print('[+]Enable debug') + if conf.link_mode() in (1, 2): + print('[!]Enable {} link'.format(('soft', 'hard')[conf.link_mode() - 1])) + if len(sys.argv) > 1: + print('[!]CmdLine:', " ".join(sys.argv[1:])) + print('[+]Main Working mode ## {}: {} ## {}{}{}' + .format(*(main_mode, ['Scraping', 'Organizing', 'Scraping in analysis folder'][main_mode - 1], + "" if not conf.multi_threading() else ", multi_threading on", + "" if conf.nfo_skip_days() == 0 else f", nfo_skip_days={conf.nfo_skip_days()}", + "" if conf.stop_counter() == 0 else f", stop_counter={conf.stop_counter()}" + ) if not single_file_path else ('-', 'Single File', '', '', '')) + ) + + create_failed_folder(conf.failed_folder()) + + # create OpenCC converter + ccm = conf.cc_convert_mode() + try: + oCC = None if ccm == 0 else OpenCC('t2s.json' if ccm == 1 else 's2t.json') + except: + # some OS no OpenCC cpython, try opencc-python-reimplemented. 
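+        # (The two call forms differ only in the config name: the native OpenCC bindings load a JSON config such as 't2s.json', while the pure-Python reimplementation takes the bare conversion name such as 't2s'.)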
+ # pip uninstall opencc && pip install opencc-python-reimplemented + oCC = None if ccm == 0 else OpenCC('t2s' if ccm == 1 else 's2t') + + if not search == '': + search_list = search.split(",") + for i in search_list: + json_data = get_data_from_json(i, oCC, None, None) + debug_print(json_data) + time.sleep(int(config.getInstance().sleep())) + os._exit(0) + + if not single_file_path == '': # Single File + print('[+]==================== Single File =====================') + if custom_number == '': + create_data_and_move_with_custom_number(single_file_path, + get_number(conf.debug(), os.path.basename(single_file_path)), oCC, + specified_source, specified_url) + else: + create_data_and_move_with_custom_number(single_file_path, custom_number, oCC, + specified_source, specified_url) + else: + folder_path = conf.source_folder() + if not isinstance(folder_path, str) or folder_path == '': + folder_path = os.path.abspath(".") + + movie_list = movie_lists(folder_path, regexstr) + + count = 0 + count_all = str(len(movie_list)) + print('[+]Find', count_all, 'movies.') + print('[*]======================================================') + stop_count = conf.stop_counter() + if stop_count < 1: + stop_count = 999999 + else: + count_all = str(min(len(movie_list), stop_count)) + + for movie_path in movie_list: # 遍历电影列表 交给core处理 + count = count + 1 + percentage = str(count / int(count_all) * 100)[:4] + '%' + print('[!] {:>30}{:>21}'.format('- ' + percentage + ' [' + str(count) + '/' + count_all + '] -', + time.strftime("%H:%M:%S"))) + create_data_and_move(movie_path, zero_op, no_net_op, oCC) + if count >= stop_count: + print("[!]Stop counter triggered!") + break + sleep_seconds = random.randint(conf.sleep(), conf.sleep() + 2) + time.sleep(sleep_seconds) + + if conf.del_empty_folder() and not zero_op: + rm_empty_folder(conf.success_folder()) + rm_empty_folder(conf.failed_folder()) + if len(folder_path): + rm_empty_folder(folder_path) + + end_time = time.time() + total_time = str(timedelta(seconds=end_time - start_time)) + print("[+]Running time", total_time[:len(total_time) if total_time.rfind('.') < 0 else -3], + " End at", time.strftime("%Y-%m-%d %H:%M:%S")) + + print("[+]All finished!!!") + + return close_logfile(logdir) + + +def 分析日志文件(logfile): + try: + if not (isinstance(logfile, Path) and logfile.is_file()): + raise FileNotFoundError('log file not found') + logtxt = logfile.read_text(encoding='utf-8') + 扫描电影数 = int(re.findall(r'\[\+]Find (.*) movies\.', logtxt)[0]) + 已处理 = int(re.findall(r'\[1/(.*?)] -', logtxt)[0]) + 完成数 = logtxt.count(r'[+]Wrote!') + return 扫描电影数, 已处理, 完成数 + except: + return None, None, None + + +def period(delta, pattern): + d = {'d': delta.days} + d['h'], rem = divmod(delta.seconds, 3600) + d['m'], d['s'] = divmod(rem, 60) + return pattern.format(**d) + + +if __name__ == '__main__': + version = '6.6.7' + urllib3.disable_warnings() # Ignore http proxy warning + app_start = time.time() + + # Read config.ini first, in argparse_function() need conf.failed_folder() + conf = config.getInstance() + + # Parse command line args + args = tuple(argparse_function(version)) + + 再运行延迟 = conf.rerun_delay() + if 再运行延迟 > 0 and conf.stop_counter() > 0: + while True: + try: + logfile = main(args) + (扫描电影数, 已处理, 完成数) = 分析结果元组 = tuple(分析日志文件(logfile)) + if all(isinstance(v, int) for v in 分析结果元组): + 剩余个数 = 扫描电影数 - 已处理 + 总用时 = timedelta(seconds = time.time() - app_start) + print(f'All movies:{扫描电影数} processed:{已处理} successes:{完成数} remain:{剩余个数}' + + ' Elapsed time {}'.format( + period(总用时, "{d} day 
{h}:{m:02}:{s:02}") if 总用时.days == 1 + else period(总用时, "{d} days {h}:{m:02}:{s:02}") if 总用时.days > 1 + else period(总用时, "{h}:{m:02}:{s:02}"))) + if 剩余个数 == 0: + break + 下次运行 = datetime.now() + timedelta(seconds=再运行延迟) + print(f'Next run time: {下次运行.strftime("%H:%M:%S")}, rerun_delay={再运行延迟}, press Ctrl+C stop run.') + time.sleep(再运行延迟) + else: + break + except: + break + else: + main(args) + + if not conf.auto_exit(): + if sys.platform == 'win32': + input("Press enter key exit, you can check the error message before you exit...") diff --git a/README.md b/README.md new file mode 100644 index 0000000..7c987ad --- /dev/null +++ b/README.md @@ -0,0 +1,19 @@ +

# Movie Data Capture

+ +# 部分代码开源 + +**本地电影元数据 抓取工具 | 刮削器**,配合本地影片管理软件 Emby, Jellyfin, Kodi 等管理本地影片,该软件起到分类与元数据(metadata)抓取作用,利用元数据信息来分类,仅供本地影片分类整理使用。 + +# 申明 +当你查阅、下载了本项目源代码或二进制程序,即代表你接受了以下条款 +* 本项目和项目成果仅供技术,学术交流和Python3性能测试使用 +* 本项目贡献者编写该项目旨在学习Python3 ,提高编程水平 +* 本项目不提供任何影片下载的线索 +* 用户在使用本项目和项目成果前,请用户了解并遵守当地法律法规,如果本项目及项目成果使用过程中存在违反当地法律法规的行为,请勿使用该项目及项目成果 +* 法律后果及使用后果由使用者承担 +* [GPL LICENSE](https://github.com/yoshiko2/Movie_Data_Capture/blob/master/LICENSE) +* 若用户不同意上述条款任意一条,请勿使用本项目和项目成果 + +# 贡献者 +[![](https://opencollective.com/movie_data_capture/contributors.svg?width=890)](https://github.com/yoshiko2/movie_data_Capture/graphs/contributors) + diff --git a/README_EN.md b/README_EN.md new file mode 100644 index 0000000..7a37be9 --- /dev/null +++ b/README_EN.md @@ -0,0 +1,23 @@ +

# Movie Data Capture

+ +# Part of the code is open source + +**Movie Metadata Scraper** works with local media management software such as Emby, Jellyfin, and Kodi to manage a local movie library: +it classifies movies and fetches their metadata, using that metadata for classification. It is intended only for classifying and organizing a local movie collection. + +[中文 | Chinese](https://github.com/yoshiko2/Movie_Data_Capture/blob/master/README.md) + +# NOTICE +By viewing or downloading the source code or binary program of this project, you accept the following terms: +* **You must be over 18 years old, or leave the page immediately.** +* This project and its results are for technical and academic exchange and Python3 performance testing purposes only. +* The contributors to this project wrote it to learn Python3 and improve their programming skills. +* This project does not provide any movie download leads. +* Legal consequences and the consequences of use are borne by the user. +* [GPL LICENSE](https://github.com/yoshiko2/Movie_Data_Capture/blob/master/LICENSE) +* If you do not agree to any of the above terms, do not use this project or its results. + +# Contributors +[![](https://opencollective.com/movie_data_capture/contributors.svg?width=890)](https://github.com/yoshiko2/movie_data_Capture/graphs/contributors) + + diff --git a/README_ZH.md b/README_ZH.md new file mode 100644 index 0000000..ad8a987 --- /dev/null +++ b/README_ZH.md @@ -0,0 +1,19 @@ +

# Movie Data Capture

+ +# 部分代码开源 + +**本地电影元数据 抓取工具 | 刮削器**,配合本地影片管理软件 Emby, Jellyfin, Kodi 等管理本地影片,该软件起到分类与元数据(metadata)抓取作用,利用元数据信息来分类,仅供本地影片分类整理使用。 + +# 申明 +当你查阅、下载了本项目源代码或二进制程序,即代表你接受了以下条款 +* 本项目和项目成果仅供技术,学术交流和Python3性能测试使用 +* 本项目贡献者编写该项目旨在学习Python3 ,提高编程水平 +* 本项目不提供任何影片下载的线索 +* 用户在使用本项目和项目成果前,请用户了解并遵守当地法律法规,如果本项目及项目成果使用过程中存在违反当地法律法规的行为,请勿使用该项目及项目成果 +* 法律后果及使用后果由使用者承担 +* [GPL LICENSE](https://github.com/yoshiko2/Movie_Data_Capture/blob/master/LICENSE) +* 若用户不同意上述条款任意一条,请勿使用本项目和项目成果 + +# 贡献者 +[![](https://opencollective.com/movie_data_capture/contributors.svg?width=890)](https://github.com/yoshiko2/movie_data_Capture/graphs/contributors) + diff --git a/config.ini b/config.ini new file mode 100755 index 0000000..dd1238d --- /dev/null +++ b/config.ini @@ -0,0 +1,156 @@ +# 详细教程请看 +# - https://github.com/yoshiko2/Movie_Data_Capture/wiki/%E9%85%8D%E7%BD%AE%E6%96%87%E4%BB%B6 +[common] +main_mode = 1 +source_folder = ./ +failed_output_folder = failed +success_output_folder = output +link_mode = 0 +; 0: 不刮削硬链接文件 1: 刮削硬链接文件 +scan_hardlink = 0 +failed_move = 0 +auto_exit = 0 +translate_to_sc = 0 +multi_threading = 0 +;actor_gender value: female(♀) or male(♂) or both(♀ ♂) or all(♂ ♀ ⚧) +actor_gender = female +del_empty_folder = 1 +; 跳过最近(默认:30)天新修改过的.NFO,可避免整理模式(main_mode=3)和软连接(soft_link=0)时 +; 反复刮削靠前的视频文件,0为处理所有视频文件 +nfo_skip_days = 30 +ignore_failed_list = 0 +download_only_missing_images = 1 +mapping_table_validity = 7 +; 一些jellyfin中特有的设置 (0:不开启, 1:开启) 比如 +; 在jellyfin中tags和genres重复,因此可以只需保存genres到nfo中 +; jellyfin中只需要保存thumb,不需要保存fanart +jellyfin = 0 +; 开启后tag和genere只显示演员 +actor_only_tag = 0 +sleep = 3 +anonymous_fill = 1 + +[advenced_sleep] +; 处理完多少个视频文件后停止,0为处理所有视频文件 +stop_counter = 0 +; 再运行延迟时间,单位:h时m分s秒 举例: 1h30m45s(1小时30分45秒) 45(45秒) +; stop_counter不为零的条件下才有效,每处理stop_counter部影片后延迟rerun_delay秒再次运行 +rerun_delay = 0 +; 以上参数配合使用可以以多次少量的方式刮削或整理数千个文件而不触发翻译或元数据站封禁 + +[proxy] +;proxytype: http or socks5 or socks5h switch: 0 1 +switch = 0 +type = socks5h +proxy = 127.0.0.1:1080 +timeout = 20 +retry = 3 +cacert_file = + +[Name_Rule] +location_rule = actor+"/"+title +naming_rule = number+"-"+title +max_title_len = 50 +; 刮削后图片是否命名为号码 +image_naming_with_number = 0 +; 番号大写 1 | 0, 仅在写入数据时会进行大写转换, 搜索刮削流程则不影响 +number_uppercase = 0 +; 自定义正则表达式, 多个正则使用空格隔开, 第一个分组为提取的番号, 若自定义正则未能匹配到番号则使用默认规则 +; example: ([A-Za-z]{2,4}\-\d{3}) ([A-Za-z]{2,4}00\d{3}) +number_regexs = + +[update] +update_check = 1 + +[priority] +website = tmdb,imdb + +[escape] +literals = \()/ +folders = failed,output + +[debug_mode] +switch = 0 + +[translate] +switch = 0 +; engine: google-free,azure,deeplx +engine = google-free +; en_us fr_fr de_de... 
(only google-free now) +target_language = zh_cn +; Azure translate API key +key = +; Translate delay, Bigger Better +delay = 3 +; title,outline,actor,tag +values = title,outline +; Google translate site, or Deeplx site +service_site = translate.google.com + +; 预告片 +[trailer] +switch = 0 + +[uncensored] +uncensored_prefix = PT-,S2M,BT,LAF,SMD,SMBD,SM3D2DBD,SKY-,SKYHD,CWP,CWDV,CWBD,CW3D2DBD,MKD,MKBD,MXBD,MK3D2DBD,MCB3DBD,MCBD,RHJ,MMDV + +[media] +media_type = .mp4,.avi,.rmvb,.wmv,.mov,.mkv,.flv,.ts,.webm,.iso,.mpg,.m4v +sub_type = .smi,.srt,.idx,.sub,.sup,.psb,.ssa,.ass,.usf,.xss,.ssf,.rt,.lrc,.sbv,.vtt,.ttml + +; 水印 +[watermark] +switch = 1 +water = 2 +; 左上 0, 右上 1, 右下 2, 左下 3 + +; 剧照 +[extrafanart] +switch = 1 +parallel_download = 5 +extrafanart_folder = extrafanart + +; 剧情简介 +[storyline] +switch = 1 +; website为javbus javdb avsox xcity carib时,site censored_site uncensored_site 为获取剧情简介信息的 +; 可选数据源站点列表。列表内站点同时并发查询,取值优先级由冒号前的序号决定,从小到大,数字小的站点没数据才会采用后面站点获得的。 +; 其中airavwiki airav avno1 58avgo是中文剧情简介,区别是airav只能查有码,avno1 airavwiki 有码无码都能查, +; 58avgo只能查无码或者流出破解马赛克的影片(此功能没使用)。 +; xcity和amazon是日语的,由于amazon商城没有番号信息,选中对应DVD的准确率仅99.6%。如果三个列表全部为空则不查询, +; 设置成不查询可大幅提高刮削速度。 +; site= +site = +censored_site = +uncensored_site = +; 运行模式:0:顺序执行(最慢) 1:线程池(默认值) 2:进程池(启动开销比线程池大,并发站点越多越快) +run_mode = 1 +; show_result剧情简介调试信息 0关闭 1简略 2详细(详细部分不记入日志),剧情简介失效时可打开2查看原因 +show_result = 0 + +; 繁简转换 繁简转换模式mode=0:不转换 1:繁转简 2:简转繁 +[cc_convert] +mode = 1 +vars = outline,series,studio,tag,title + +[javdb] +sites = 521 + +; 人脸识别 locations_model=hog:方向梯度直方图(不太准确,速度快) cnn:深度学习模型(准确,需要GPU/CUDA,速度慢) +; uncensored_only=0:对全部封面进行人脸识别 1:只识别无码封面,有码封面直接切右半部分 +; aways_imagecut=0:按各网站默认行为 1:总是裁剪封面,开启此项将无视[common]download_only_missing_images=1总是覆盖封面 +; 封面裁剪的宽高比可配置,公式为aspect_ratio/3。默认aspect_ratio=2.12: 适配大部分有码影片封面,前一版本默认为2/3即aspect_ratio=2 +[face] +locations_model = hog +uncensored_only = 1 +aways_imagecut = 0 +aspect_ratio = 2.12 + +[jellyfin] +multi_part_fanart = 0 + +[actor_photo] +download_for_kodi = 0 + +[direct] +switch = 1 diff --git a/config.py b/config.py new file mode 100644 index 0000000..704bc47 --- /dev/null +++ b/config.py @@ -0,0 +1,648 @@ +import os +import re +import sys +import configparser +import time +import typing +from pathlib import Path + +G_conf_override = { + # index 0 save Config() first instance for quick access by using getInstance() + 0: None, + # register override config items + # no need anymore +} + + +def getInstance(): + if isinstance(G_conf_override[0], Config): + return G_conf_override[0] + return Config() + + +class Config: + def __init__(self, path: str = "config.ini"): + path_search_order = ( + Path(path), + Path.cwd() / "config.ini", + Path.home() / "mdc.ini", + Path.home() / ".mdc.ini", + Path.home() / ".mdc/config.ini", + Path.home() / ".config/mdc/config.ini" + ) + ini_path = None + for p in path_search_order: + if p.is_file(): + ini_path = p.resolve() + break + if ini_path: + self.conf = configparser.ConfigParser() + self.ini_path = ini_path + try: + if self.conf.read(ini_path, encoding="utf-8-sig"): + if G_conf_override[0] is None: + G_conf_override[0] = self + except UnicodeDecodeError: + if self.conf.read(ini_path, encoding="utf-8"): + if G_conf_override[0] is None: + G_conf_override[0] = self + except Exception as e: + print("ERROR: Config file can not read!") + print("读取配置文件出错!") + print('=================================') + print(e) + print("======= Auto exit in 60s ======== ") + time.sleep(60) + os._exit(-1) + else: + print("ERROR: Config file not found!") + print("Please put config file 
into one of the following path:") + print('\n'.join([str(p.resolve()) for p in path_search_order[2:]])) + # 对于找不到配置文件的情况,还是在打包时附上对应版本的默认配置文件,有需要时为其在搜索路径中生成, + # 要比用户乱找一个版本不对应的配置文件会可靠些。这样一来,单个执行文件就是功能完整的了,放在任何 + # 执行路径下都可以放心使用。 + res_path = None + # pyinstaller打包的在打包中找config.ini + if hasattr(sys, '_MEIPASS') and (Path(getattr(sys, '_MEIPASS')) / 'config.ini').is_file(): + res_path = Path(getattr(sys, '_MEIPASS')) / 'config.ini' + # 脚本运行的所在位置找 + elif (Path(__file__).resolve().parent / 'config.ini').is_file(): + res_path = Path(__file__).resolve().parent / 'config.ini' + if res_path is None: + os._exit(2) + ins = input("Or, Do you want me create a config file for you? (Yes/No)[Y]:") + if re.search('n', ins, re.I): + os._exit(2) + # 用户目录才确定具有写权限,因此选择 ~/mdc.ini 作为配置文件生成路径,而不是有可能并没有写权限的 + # 当前目录。目前版本也不再鼓励使用当前路径放置配置文件了,只是作为多配置文件的切换技巧保留。 + write_path = path_search_order[2] # Path.home() / "mdc.ini" + write_path.write_text(res_path.read_text(encoding='utf-8'), encoding='utf-8') + print("Config file '{}' created.".format(write_path.resolve())) + input("Press Enter key exit...") + os._exit(0) + # self.conf = self._default_config() + # try: + # self.conf = configparser.ConfigParser() + # try: # From single crawler debug use only + # self.conf.read('../' + path, encoding="utf-8-sig") + # except: + # self.conf.read('../' + path, encoding="utf-8") + # except Exception as e: + # print("[-]Config file not found! Use the default settings") + # print("[-]",e) + # os._exit(3) + # #self.conf = self._default_config() + + def set_override(self, option_cmd: str): + """ + 通用的参数覆盖选项 -C 配置覆盖串 + 配置覆盖串语法:小节名:键名=值[;[小节名:]键名=值][;[小节名:]键名+=值] 多个键用分号分隔 名称可省略部分尾部字符 + 或 小节名:键名+=值[;[小节名:]键名=值][;[小节名:]键名+=值] 在已有值的末尾追加内容,多个键的=和+=可以交叉出现 + 例子: face:aspect_ratio=2;aways_imagecut=1;priority:website=javdb + 小节名必须出现在开头至少一次,分号后可只出现键名=值,不再出现小节名,如果后续全部键名都属于同一个小节 + 例如配置文件存在两个小节[proxy][priority],那么pro可指代proxy,pri可指代priority + [face] ;face小节下方有4个键名locations_model= uncensored_only= aways_imagecut= aspect_ratio= + l,lo,loc,loca,locat,locati...直到locations_model完整名称都可以用来指代locations_model=键名 + u,un,unc...直到uncensored_only完整名称都可以用来指代uncensored_only=键名 + aw,awa...直到aways_imagecut完整名称都可以用来指代aways_imagecut=键名 + as,asp...aspect_ratio完整名称都可以用来指代aspect_ratio=键名 + a则因为二义性,不是合法的省略键名 + """ + def err_exit(str): + print(str) + os._exit(2) + + sections = self.conf.sections() + sec_name = None + for cmd in option_cmd.split(';'): + syntax_err = True + rex = re.findall(r'^(.*?):(.*?)(=|\+=)(.*)$', cmd, re.U) + if len(rex) and len(rex[0]) == 4: + (sec, key, assign, val) = rex[0] + sec_lo = sec.lower().strip() + key_lo = key.lower().strip() + syntax_err = False + elif sec_name: # 已经出现过一次小节名,属于同一个小节的后续键名可以省略小节名 + rex = re.findall(r'^(.*?)(=|\+=)(.*)$', cmd, re.U) + if len(rex) and len(rex[0]) == 3: + (key, assign, val) = rex[0] + sec_lo = sec_name.lower() + key_lo = key.lower().strip() + syntax_err = False + if syntax_err: + err_exit(f"[-]Config override syntax incorrect. example: 'd:s=1' or 'debug_mode:switch=1'. cmd='{cmd}' all='{option_cmd}'") + if not len(sec_lo): + err_exit(f"[-]Config override Section name '{sec}' is empty! cmd='{cmd}'") + if not len(key_lo): + err_exit(f"[-]Config override Key name '{key}' is empty! cmd='{cmd}'") + if not len(val.strip()): + print(f"[!]Conig overide value '{val}' is empty! cmd='{cmd}'") + sec_name = None + for s in sections: + if not s.lower().startswith(sec_lo): + continue + if sec_name: + err_exit(f"[-]Conig overide Section short name '{sec_lo}' is not unique! 
dup1='{sec_name}' dup2='{s}' cmd='{cmd}'") + sec_name = s + if sec_name is None: + err_exit(f"[-]Conig overide Section name '{sec}' not found! cmd='{cmd}'") + key_name = None + keys = self.conf[sec_name] + for k in keys: + if not k.lower().startswith(key_lo): + continue + if key_name: + err_exit(f"[-]Conig overide Key short name '{key_lo}' is not unique! dup1='{key_name}' dup2='{k}' cmd='{cmd}'") + key_name = k + if key_name is None: + err_exit(f"[-]Conig overide Key name '{key}' not found! cmd='{cmd}'") + if assign == "+=": + val = keys[key_name] + val + if self.debug(): + print(f"[!]Set config override [{sec_name}]{key_name}={val} by cmd='{cmd}'") + self.conf.set(sec_name, key_name, val) + + def main_mode(self) -> int: + try: + return self.conf.getint("common", "main_mode") + except ValueError: + self._exit("common:main_mode") + + def source_folder(self) -> str: + return self.conf.get("common", "source_folder").replace("\\\\", "/").replace("\\", "/") + + def failed_folder(self) -> str: + return self.conf.get("common", "failed_output_folder").replace("\\\\", "/").replace("\\", "/") + + def success_folder(self) -> str: + return self.conf.get("common", "success_output_folder").replace("\\\\", "/").replace("\\", "/") + + def actor_gender(self) -> str: + return self.conf.get("common", "actor_gender") + + def link_mode(self) -> int: + return self.conf.getint("common", "link_mode") + + def scan_hardlink(self) -> bool: + return self.conf.getboolean("common", "scan_hardlink", fallback=False)#未找到配置选项,默认不刮削 + + def failed_move(self) -> bool: + return self.conf.getboolean("common", "failed_move") + + def auto_exit(self) -> bool: + return self.conf.getboolean("common", "auto_exit") + + def translate_to_sc(self) -> bool: + return self.conf.getboolean("common", "translate_to_sc") + + def multi_threading(self) -> bool: + return self.conf.getboolean("common", "multi_threading") + + def del_empty_folder(self) -> bool: + return self.conf.getboolean("common", "del_empty_folder") + + def nfo_skip_days(self) -> int: + return self.conf.getint("common", "nfo_skip_days", fallback=30) + + def ignore_failed_list(self) -> bool: + return self.conf.getboolean("common", "ignore_failed_list") + + def download_only_missing_images(self) -> bool: + return self.conf.getboolean("common", "download_only_missing_images") + + def mapping_table_validity(self) -> int: + return self.conf.getint("common", "mapping_table_validity") + + def jellyfin(self) -> int: + return self.conf.getint("common", "jellyfin") + + def actor_only_tag(self) -> bool: + return self.conf.getboolean("common", "actor_only_tag") + + def sleep(self) -> int: + return self.conf.getint("common", "sleep") + + def anonymous_fill(self) -> bool: + return self.conf.getint("common", "anonymous_fill") + + def stop_counter(self) -> int: + return self.conf.getint("advenced_sleep", "stop_counter", fallback=0) + + def rerun_delay(self) -> int: + value = self.conf.get("advenced_sleep", "rerun_delay") + if not (isinstance(value, str) and re.match(r'^[\dsmh]+$', value, re.I)): + return 0 # not match '1h30m45s' or '30' or '1s2m1h4s5m' + if value.isnumeric() and int(value) >= 0: + return int(value) + sec = 0 + sec += sum(int(v) for v in re.findall(r'(\d+)s', value, re.I)) + sec += sum(int(v) for v in re.findall(r'(\d+)m', value, re.I)) * 60 + sec += sum(int(v) for v in re.findall(r'(\d+)h', value, re.I)) * 3600 + return sec + + def is_translate(self) -> bool: + return self.conf.getboolean("translate", "switch") + + def is_trailer(self) -> bool: + return 
self.conf.getboolean("trailer", "switch") + + def is_watermark(self) -> bool: + return self.conf.getboolean("watermark", "switch") + + def is_extrafanart(self) -> bool: + return self.conf.getboolean("extrafanart", "switch") + + def extrafanart_thread_pool_download(self) -> int: + try: + v = self.conf.getint("extrafanart", "parallel_download") + return v if v >= 0 else 5 + except: + return 5 + + def watermark_type(self) -> int: + return int(self.conf.get("watermark", "water")) + + def get_uncensored(self): + try: + sec = "uncensored" + uncensored_prefix = self.conf.get(sec, "uncensored_prefix") + # uncensored_poster = self.conf.get(sec, "uncensored_poster") + return uncensored_prefix + + except ValueError: + self._exit("uncensored") + + def get_extrafanart(self): + try: + extrafanart_download = self.conf.get("extrafanart", "extrafanart_folder") + return extrafanart_download + except ValueError: + self._exit("extrafanart_folder") + + def get_translate_engine(self) -> str: + return self.conf.get("translate", "engine") + + def get_target_language(self) -> str: + return self.conf.get("translate", "target_language") + + # def get_translate_appId(self) ->str: + # return self.conf.get("translate","appid") + + def get_translate_key(self) -> str: + return self.conf.get("translate", "key") + + def get_translate_delay(self) -> int: + return self.conf.getint("translate", "delay") + + def translate_values(self) -> str: + return self.conf.get("translate", "values") + + def get_translate_service_site(self) -> str: + return self.conf.get("translate", "service_site") + + def proxy(self): + try: + sec = "proxy" + switch = self.conf.get(sec, "switch") + proxy = self.conf.get(sec, "proxy") + timeout = self.conf.getint(sec, "timeout") + retry = self.conf.getint(sec, "retry") + proxytype = self.conf.get(sec, "type") + iniProxy = IniProxy(switch, proxy, timeout, retry, proxytype) + return iniProxy + except ValueError: + self._exit("common") + + def cacert_file(self) -> str: + return self.conf.get('proxy', 'cacert_file') + + def media_type(self) -> str: + return self.conf.get('media', 'media_type') + + def sub_rule(self) -> typing.Set[str]: + return set(self.conf.get('media', 'sub_type').lower().split(',')) + + def naming_rule(self) -> str: + return self.conf.get("Name_Rule", "naming_rule") + + def location_rule(self) -> str: + return self.conf.get("Name_Rule", "location_rule") + + def max_title_len(self) -> int: + """ + Maximum title length + """ + try: + return self.conf.getint("Name_Rule", "max_title_len") + except: + return 50 + + def image_naming_with_number(self) -> bool: + try: + return self.conf.getboolean("Name_Rule", "image_naming_with_number") + except: + return False + + def number_uppercase(self) -> bool: + try: + return self.conf.getboolean("Name_Rule", "number_uppercase") + except: + return False + + def number_regexs(self) -> str: + try: + return self.conf.get("Name_Rule", "number_regexs") + except: + return "" + + def update_check(self) -> bool: + try: + return self.conf.getboolean("update", "update_check") + except ValueError: + self._exit("update:update_check") + + def sources(self) -> str: + return self.conf.get("priority", "website") + + def escape_literals(self) -> str: + return self.conf.get("escape", "literals") + + def escape_folder(self) -> str: + return self.conf.get("escape", "folders") + + def debug(self) -> bool: + return self.conf.getboolean("debug_mode", "switch") + + def get_direct(self) -> bool: + return self.conf.getboolean("direct", "switch") + + def is_storyline(self) -> bool: 
+ try: + return self.conf.getboolean("storyline", "switch") + except: + return True + + def storyline_site(self) -> str: + try: + return self.conf.get("storyline", "site") + except: + return "1:avno1,4:airavwiki" + + def storyline_censored_site(self) -> str: + try: + return self.conf.get("storyline", "censored_site") + except: + return "2:airav,5:xcity,6:amazon" + + def storyline_uncensored_site(self) -> str: + try: + return self.conf.get("storyline", "uncensored_site") + except: + return "3:58avgo" + + def storyline_show(self) -> int: + v = self.conf.getint("storyline", "show_result", fallback=0) + return v if v in (0, 1, 2) else 2 if v > 2 else 0 + + def storyline_mode(self) -> int: + return 1 if self.conf.getint("storyline", "run_mode", fallback=1) > 0 else 0 + + def cc_convert_mode(self) -> int: + v = self.conf.getint("cc_convert", "mode", fallback=1) + return v if v in (0, 1, 2) else 2 if v > 2 else 0 + + def cc_convert_vars(self) -> str: + return self.conf.get("cc_convert", "vars", + fallback="actor,director,label,outline,series,studio,tag,title") + + def javdb_sites(self) -> str: + return self.conf.get("javdb", "sites", fallback="38,39") + + def face_locations_model(self) -> str: + return self.conf.get("face", "locations_model", fallback="hog") + + def face_uncensored_only(self) -> bool: + return self.conf.getboolean("face", "uncensored_only", fallback=True) + + def face_aways_imagecut(self) -> bool: + return self.conf.getboolean("face", "aways_imagecut", fallback=False) + + def face_aspect_ratio(self) -> float: + return self.conf.getfloat("face", "aspect_ratio", fallback=2.12) + + def jellyfin_multi_part_fanart(self) -> bool: + return self.conf.getboolean("jellyfin", "multi_part_fanart", fallback=False) + + def download_actor_photo_for_kodi(self) -> bool: + return self.conf.getboolean("actor_photo", "download_for_kodi", fallback=False) + + @staticmethod + def _exit(sec: str) -> None: + print("[-] Read config error! 
Please check the {} section in config.ini", sec) + input("[-] Press ENTER key to exit.") + exit() + + @staticmethod + def _default_config() -> configparser.ConfigParser: + conf = configparser.ConfigParser() + + sec1 = "common" + conf.add_section(sec1) + conf.set(sec1, "main_mode", "1") + conf.set(sec1, "source_folder", "./") + conf.set(sec1, "failed_output_folder", "failed") + conf.set(sec1, "success_output_folder", "JAV_output") + conf.set(sec1, "link_mode", "0") + conf.set(sec1, "scan_hardlink", "0") + conf.set(sec1, "failed_move", "1") + conf.set(sec1, "auto_exit", "0") + conf.set(sec1, "translate_to_sc", "1") + # actor_gender value: female or male or both or all(含人妖) + conf.set(sec1, "actor_gender", "female") + conf.set(sec1, "del_empty_folder", "1") + conf.set(sec1, "nfo_skip_days", "30") + conf.set(sec1, "ignore_failed_list", "0") + conf.set(sec1, "download_only_missing_images", "1") + conf.set(sec1, "mapping_table_validity", "7") + conf.set(sec1, "jellyfin", "0") + conf.set(sec1, "actor_only_tag", "0") + conf.set(sec1, "sleep", "3") + conf.set(sec1, "anonymous_fill", "0") + + sec2 = "advenced_sleep" + conf.add_section(sec2) + conf.set(sec2, "stop_counter", "0") + conf.set(sec2, "rerun_delay", "0") + + sec3 = "proxy" + conf.add_section(sec3) + conf.set(sec3, "proxy", "") + conf.set(sec3, "timeout", "5") + conf.set(sec3, "retry", "3") + conf.set(sec3, "type", "socks5") + conf.set(sec3, "cacert_file", "") + + sec4 = "Name_Rule" + conf.add_section(sec4) + conf.set(sec4, "location_rule", "actor + '/' + number") + conf.set(sec4, "naming_rule", "number + '-' + title") + conf.set(sec4, "max_title_len", "50") + conf.set(sec4, "image_naming_with_number", "0") + conf.set(sec4, "number_uppercase", "0") + conf.set(sec4, "number_regexs", "") + + sec5 = "update" + conf.add_section(sec5) + conf.set(sec5, "update_check", "1") + + sec6 = "priority" + conf.add_section(sec6) + conf.set(sec6, "website", "airav,javbus,javdb,fanza,xcity,mgstage,fc2,fc2club,avsox,jav321,xcity") + + sec7 = "escape" + conf.add_section(sec7) + conf.set(sec7, "literals", "\()/") # noqa + conf.set(sec7, "folders", "failed, JAV_output") + + sec8 = "debug_mode" + conf.add_section(sec8) + conf.set(sec8, "switch", "0") + + sec9 = "translate" + conf.add_section(sec9) + conf.set(sec9, "switch", "0") + conf.set(sec9, "engine", "google-free") + conf.set(sec9, "target_language", "zh_cn") + # conf.set(sec8, "appid", "") + conf.set(sec9, "key", "") + conf.set(sec9, "delay", "1") + conf.set(sec9, "values", "title,outline") + conf.set(sec9, "service_site", "translate.google.cn") + + sec10 = "trailer" + conf.add_section(sec10) + conf.set(sec10, "switch", "0") + + sec11 = "uncensored" + conf.add_section(sec11) + conf.set(sec11, "uncensored_prefix", "S2M,BT,LAF,SMD") + + sec12 = "media" + conf.add_section(sec12) + conf.set(sec12, "media_type", + ".mp4,.avi,.rmvb,.wmv,.mov,.mkv,.flv,.ts,.webm,iso") + conf.set(sec12, "sub_type", + ".smi,.srt,.idx,.sub,.sup,.psb,.ssa,.ass,.usf,.xss,.ssf,.rt,.lrc,.sbv,.vtt,.ttml") + + sec13 = "watermark" + conf.add_section(sec13) + conf.set(sec13, "switch", "1") + conf.set(sec13, "water", "2") + + sec14 = "extrafanart" + conf.add_section(sec14) + conf.set(sec14, "switch", "1") + conf.set(sec14, "extrafanart_folder", "extrafanart") + conf.set(sec14, "parallel_download", "1") + + sec15 = "storyline" + conf.add_section(sec15) + conf.set(sec15, "switch", "1") + conf.set(sec15, "site", "1:avno1,4:airavwiki") + conf.set(sec15, "censored_site", "2:airav,5:xcity,6:amazon") + conf.set(sec15, "uncensored_site", "3:58avgo") 
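+        # The three storyline site lists above use "priority:site" pairs (e.g. "1:avno1"); per the [storyline] comments in config.ini, the lower number before the colon wins when several sites return data.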
+ conf.set(sec15, "show_result", "0") + conf.set(sec15, "run_mode", "1") + conf.set(sec15, "cc_convert", "1") + + sec16 = "cc_convert" + conf.add_section(sec16) + conf.set(sec16, "mode", "1") + conf.set(sec16, "vars", "actor,director,label,outline,series,studio,tag,title") + + sec17 = "javdb" + conf.add_section(sec17) + conf.set(sec17, "sites", "33,34") + + sec18 = "face" + conf.add_section(sec18) + conf.set(sec18, "locations_model", "hog") + conf.set(sec18, "uncensored_only", "1") + conf.set(sec18, "aways_imagecut", "0") + conf.set(sec18, "aspect_ratio", "2.12") + + sec19 = "jellyfin" + conf.add_section(sec19) + conf.set(sec19, "multi_part_fanart", "0") + + sec20 = "actor_photo" + conf.add_section(sec20) + conf.set(sec20, "download_for_kodi", "0") + + return conf + + +class IniProxy(): + """ Proxy Config from .ini + """ + SUPPORT_PROXY_TYPE = ("http", "socks5", "socks5h") + + enable = False + address = "" + timeout = 5 + retry = 3 + proxytype = "socks5" + + def __init__(self, switch, address, timeout, retry, proxytype) -> None: + """ Initial Proxy from .ini + """ + if switch == '1' or switch == 1: + self.enable = True + self.address = address + self.timeout = timeout + self.retry = retry + self.proxytype = proxytype + + def proxies(self): + """ + 获得代理参数,默认http代理 + get proxy params, use http proxy for default + """ + if self.address: + if self.proxytype in self.SUPPORT_PROXY_TYPE: + proxies = {"http": self.proxytype + "://" + self.address, + "https": self.proxytype + "://" + self.address} + else: + proxies = {"http": "http://" + self.address, "https": "https://" + self.address} + else: + proxies = {} + + return proxies + + +if __name__ == "__main__": + def evprint(evstr): + code = compile(evstr, "", "eval") + print('{}: "{}"'.format(evstr, eval(code))) + + + config = Config() + mfilter = {'conf', 'proxy', '_exit', '_default_config', 'ini_path', 'set_override'} + for _m in [m for m in dir(config) if not m.startswith('__') and m not in mfilter]: + evprint(f'config.{_m}()') + pfilter = {'proxies', 'SUPPORT_PROXY_TYPE'} + # test getInstance() + assert (getInstance() == config) + for _p in [p for p in dir(getInstance().proxy()) if not p.startswith('__') and p not in pfilter]: + evprint(f'getInstance().proxy().{_p}') + + # Create new instance + conf2 = Config() + assert getInstance() != conf2 + assert getInstance() == config + + conf2.set_override("d:s=1;face:asp=2;f:aw=0;pri:w=javdb;f:l=") + assert conf2.face_aspect_ratio() == 2 + assert conf2.face_aways_imagecut() == False + assert conf2.sources() == "javdb" + print(f"Load Config file '{conf2.ini_path}'.") diff --git a/core.py b/core.py new file mode 100644 index 0000000..7edc03d --- /dev/null +++ b/core.py @@ -0,0 +1,1016 @@ +import os.path +import pathlib +import shutil +import sys + +from PIL import Image +from io import BytesIO +from datetime import datetime +# from videoprops import get_video_properties + +from ADC_function import * +from scraper import get_data_from_json +from number_parser import is_uncensored +from ImageProcessing import cutImage + + +# from WebCrawler import get_data_from_json + +def escape_path(path, escape_literals: str): # Remove escape literals + backslash = '\\' + for literal in escape_literals: + path = path.replace(backslash + literal, '') + return path + + +def moveFailedFolder(filepath): + conf = config.getInstance() + failed_folder = conf.failed_folder() + link_mode = conf.link_mode() + # 模式3或软连接,改为维护一个失败列表,启动扫描时加载用于排除该路径,以免反复处理 + # 原先的创建软连接到失败目录,并不直观,不方便找到失败文件位置,不如直接记录该文件路径 + if conf.main_mode() == 3 or 
link_mode: + ftxt = os.path.abspath(os.path.join(failed_folder, 'failed_list.txt')) + print("[-]Add to Failed List file, see '%s'" % ftxt) + with open(ftxt, 'a', encoding='utf-8') as flt: + flt.write(f'{filepath}\n') + elif conf.failed_move() and not link_mode: + failed_name = os.path.join(failed_folder, os.path.basename(filepath)) + mtxt = os.path.abspath(os.path.join(failed_folder, 'where_was_i_before_being_moved.txt')) + print("[-]Move to Failed output folder, see '%s'" % mtxt) + with open(mtxt, 'a', encoding='utf-8') as wwibbmt: + tmstr = datetime.now().strftime("%Y-%m-%d %H:%M") + wwibbmt.write(f'{tmstr} FROM[{filepath}]TO[{failed_name}]\n') + try: + if os.path.exists(failed_name): + print('[-]File Exists while moving to FailedFolder') + return + shutil.move(filepath, failed_name) + except Exception: + print('[-]File Moving to FailedFolder unsuccessful!') + + +def get_info(json_data): # unpack the fields of the scraped json result + title = json_data.get('title') + studio = json_data.get('studio') + year = json_data.get('year') + outline = json_data.get('outline') + runtime = json_data.get('runtime') + director = json_data.get('director') + actor_photo = json_data.get('actor_photo', {}) + release = json_data.get('release') + number = json_data.get('number') + cover = json_data.get('cover') + trailer = json_data.get('trailer') + website = json_data.get('website') + series = json_data.get('series') + label = json_data.get('label', "") + return title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label + + +def small_cover_check(path, filename, cover_small, movie_path, json_headers=None): + full_filepath = Path(path) / filename + if config.getInstance().download_only_missing_images() and not file_not_exist_or_empty(str(full_filepath)): + return + if json_headers is not None: + download_file_with_filename(cover_small, filename, path, movie_path, json_headers['headers']) + else: + download_file_with_filename(cover_small, filename, path, movie_path) + print('[+]Image Downloaded! ' + full_filepath.name) + + +def create_folder(json_data): # create the destination folder + title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label = get_info( + json_data) + conf = config.getInstance() + success_folder = conf.success_folder() + actor = json_data.get('actor') + location_rule = eval(conf.location_rule(), json_data) + if 'actor' in conf.location_rule() and len(actor) > 100: + print(conf.location_rule()) + location_rule = eval(conf.location_rule().replace("actor", "'多人作品'"), json_data) + maxlen = conf.max_title_len() + if 'title' in conf.location_rule() and len(title) > maxlen: + shorttitle = title[0:maxlen] + location_rule = location_rule.replace(title, shorttitle) + # When actor is empty, location_rule evaluates to the absolute path '/number', + # which makes os.path.join drop the first argument; prefixing './' keeps the + # rule a relative path in all cases. + path = os.path.join(success_folder, f'./{location_rule.strip()}') + if not os.path.exists(path): + path = escape_path(path, conf.escape_literals()) + try: + os.makedirs(path) + except Exception: + path = success_folder + '/' + location_rule.replace('/[' + number + ')-' + title, "/number") + path = escape_path(path, conf.escape_literals()) + try: + os.makedirs(path) + except Exception: + print(f"[-]Fatal error! Can not make folder '{path}'") + os._exit(0) + + return os.path.normpath(path) + + +# ===================== resource download section =========================== + +# path example: photo, video inside the project folder
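+# Note on the download helpers below: they retry per [proxy]retry, write the
+# file into `path`, and always return None -- never the string 'failed' that
+# some callers compare against -- so the reliable failure signal is the result
+# on disk. Illustrative usage (filename is a placeholder):
+#
+# download_file_with_filename(url, 'poster.jpg', path, movie_path)
+# if file_not_exist_or_empty(os.path.join(path, 'poster.jpg')):
+# ... # treat as a failed download: retry or move to the failed folder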
+def download_file_with_filename(url, filename, path, filepath, json_headers=None): + conf = config.getInstance() + configProxy = conf.proxy() + + for i in range(configProxy.retry): + try: + if not os.path.exists(path): + try: + os.makedirs(path) + except: + print(f"[-]Fatal error! Can not make folder '{path}'") + os._exit(0) + r = get_html(url=url, return_type='content', json_headers=json_headers) + if r == '': + print('[-]Movie Download Data not found!') + return + with open(os.path.join(path, filename), "wb") as code: + code.write(r) + return + except requests.exceptions.ProxyError: + i += 1 + print('[-]Image Download : Proxy error ' + str(i) + '/' + str(configProxy.retry)) + # except IOError: + # print(f"[-]Create Directory '{path}' failed!") + # moveFailedFolder(filepath) + # return + except Exception as e: + print('[-]Image Download :Error', e) + print('[-]Connect Failed! Please check your Proxy or Network!') + moveFailedFolder(filepath) + return + + +def trailer_download(trailer, leak_word, c_word, hack_word, number, path, filepath): + if download_file_with_filename(trailer, number + leak_word + c_word + hack_word + '-trailer.mp4', path, + filepath) == 'failed': + return + configProxy = config.getInstance().proxy() + for i in range(configProxy.retry): + if file_not_exist_or_empty(path + '/' + number + leak_word + c_word + hack_word + '-trailer.mp4'): + print('[!]Video Download Failed! Trying again. [{}/3]', i + 1) + download_file_with_filename(trailer, number + leak_word + c_word + hack_word + '-trailer.mp4', path, + filepath) + continue + else: + break + if file_not_exist_or_empty(path + '/' + number + leak_word + c_word + hack_word + '-trailer.mp4'): + return + print('[+]Video Downloaded!', path + '/' + number + leak_word + c_word + hack_word + '-trailer.mp4') + + +def actor_photo_download(actors, save_dir, number): + if not isinstance(actors, dict) or not len(actors) or not len(save_dir): + return + save_dir = Path(save_dir) + if not save_dir.is_dir(): + return + conf = config.getInstance() + actors_dir = save_dir / '.actors' + download_only_missing_images = conf.download_only_missing_images() + dn_list = [] + for actor_name, url in actors.items(): + res = re.match(r'^http.*(\.\w+)$', url, re.A) + if not res: + continue + ext = res.group(1) + pic_fullpath = actors_dir / f'{actor_name}{ext}' + if download_only_missing_images and not file_not_exist_or_empty(pic_fullpath): + continue + dn_list.append((url, pic_fullpath)) + if not len(dn_list): + return + parallel = min(len(dn_list), conf.extrafanart_thread_pool_download()) + if parallel > 100: + print('[!]Warrning: Parallel download thread too large may cause website ban IP!') + result = parallel_download_files(dn_list, parallel) + failed = 0 + for i, r in enumerate(result): + if not r: + failed += 1 + print(f"[-]Actor photo '{dn_list[i][0]}' to '{dn_list[i][1]}' download failed!") + if failed: # 非致命错误,电影不移入失败文件夹,将来可以用模式3补齐 + print( + f"[-]Failed downloaded {failed}/{len(result)} actor photo for [{number}] to '{actors_dir}', you may retry run mode 3 later.") + else: + print(f"[+]Successfully downloaded {len(result)} actor photo.") + + +# 剧照下载成功,否则移动到failed +def extrafanart_download(data, path, number, filepath, json_data=None): + if config.getInstance().extrafanart_thread_pool_download(): + return extrafanart_download_threadpool(data, path, number, json_data) + extrafanart_download_one_by_one(data, path, filepath, json_data) + + +def extrafanart_download_one_by_one(data, path, filepath, json_data=None): + tm_start = 
time.perf_counter() + j = 1 + conf = config.getInstance() + path = os.path.join(path, conf.get_extrafanart()) + configProxy = conf.proxy() + download_only_missing_images = conf.download_only_missing_images() + for url in data: + jpg_filename = f'extrafanart-{j}.jpg' + jpg_fullpath = os.path.join(path, jpg_filename) + if download_only_missing_images and not file_not_exist_or_empty(jpg_fullpath): + continue + if download_file_with_filename(url, jpg_filename, path, filepath, json_data) == 'failed': + moveFailedFolder(filepath) + return + for i in range(configProxy.retry): + if file_not_exist_or_empty(jpg_fullpath): + print('[!]Image Download Failed! Trying again. [{}/3]', i + 1) + download_file_with_filename(url, jpg_filename, path, filepath, json_data) + continue + else: + break + if file_not_exist_or_empty(jpg_fullpath): + return + print('[+]Image Downloaded!', Path(jpg_fullpath).name) + j += 1 + if conf.debug(): + print(f'[!]Extrafanart download one by one mode runtime {time.perf_counter() - tm_start:.3f}s') + + +def extrafanart_download_threadpool(url_list, save_dir, number, json_data=None): + tm_start = time.perf_counter() + conf = config.getInstance() + extrafanart_dir = Path(save_dir) / conf.get_extrafanart() + download_only_missing_images = conf.download_only_missing_images() + dn_list = [] + for i, url in enumerate(url_list, start=1): + jpg_fullpath = extrafanart_dir / f'extrafanart-{i}.jpg' + if download_only_missing_images and not file_not_exist_or_empty(jpg_fullpath): + continue + dn_list.append((url, jpg_fullpath)) + if not len(dn_list): + return + parallel = min(len(dn_list), conf.extrafanart_thread_pool_download()) + if parallel > 100: + print('[!]Warrning: Parallel download thread too large may cause website ban IP!') + result = parallel_download_files(dn_list, parallel, json_data) + failed = 0 + for i, r in enumerate(result, start=1): + if not r: + failed += 1 + print(f'[-]Extrafanart {i} for [{number}] download failed!') + if failed: # 非致命错误,电影不移入失败文件夹,将来可以用模式3补齐 + print( + f"[-]Failed downloaded {failed}/{len(result)} extrafanart images for [{number}] to '{extrafanart_dir}', you may retry run mode 3 later.") + else: + print(f"[+]Successfully downloaded {len(result)} extrafanarts.") + if conf.debug(): + print(f'[!]Extrafanart download ThreadPool mode runtime {time.perf_counter() - tm_start:.3f}s') + + +def image_ext(url): + try: + ext = os.path.splitext(url)[-1] + if ext in {'.jpg', '.jpge', '.bmp', '.png', '.gif'}: + return ext + return ".jpg" + except: + return ".jpg" + + +# 封面是否下载成功,否则移动到failed +def image_download(cover, fanart_path, thumb_path, path, filepath, json_headers=None): + full_filepath = os.path.join(path, thumb_path) + if config.getInstance().download_only_missing_images() and not file_not_exist_or_empty(full_filepath): + return + if json_headers != None: + if download_file_with_filename(cover, thumb_path, path, filepath, json_headers['headers']) == 'failed': + moveFailedFolder(filepath) + return + else: + if download_file_with_filename(cover, thumb_path, path, filepath) == 'failed': + moveFailedFolder(filepath) + return + + configProxy = config.getInstance().proxy() + for i in range(configProxy.retry): + if file_not_exist_or_empty(full_filepath): + print('[!]Image Download Failed! Trying again. 
[{}/3]', i + 1) + if json_headers != None: + download_file_with_filename(cover, thumb_path, path, filepath, json_headers['headers']) + else: + download_file_with_filename(cover, thumb_path, path, filepath) + continue + else: + break + if file_not_exist_or_empty(full_filepath): + return + print('[+]Image Downloaded!', Path(full_filepath).name) + if not config.getInstance().jellyfin(): + shutil.copyfile(full_filepath, os.path.join(path, fanart_path)) + + +def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, filepath, tag, actor_list, liuchu, + uncensored, hack, hack_word, _4k, fanart_path, poster_path, thumb_path, iso): + + conf = config.getInstance() + title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label = get_info( + json_data) + if config.getInstance().main_mode() == 3: # 模式3下,由于视频文件不做任何改变,.nfo文件必须和视频文件名称除后缀外完全一致,KODI等软件方可支持 + nfo_path = str(Path(filepath).with_suffix('.nfo')) + else: + nfo_path = os.path.join(path, f"{number}{part}{leak_word}{c_word}{hack_word}.nfo") + try: + if not os.path.exists(path): + try: + os.makedirs(path) + except: + print(f"[-]Fatal error! can not make folder '{path}'") + os._exit(0) + + old_nfo = None + try: + if os.path.isfile(nfo_path): + old_nfo = etree.parse(nfo_path) + except: + pass + # KODI内查看影片信息时找不到number,配置naming_rule=number+'#'+title虽可解决 + # 但使得标题太长,放入时常为空的outline内会更适合,软件给outline留出的显示版面也较大 + if not outline: + pass + elif json_data['source'] == 'pissplay': + outline = f"{outline}" + else: + outline = f"{number}#{outline}" + with open(nfo_path, "wt", encoding='UTF-8') as code: + print('', file=code) + print("", file=code) + if not config.getInstance().jellyfin(): + print(" <![CDATA[" + naming_rule + "]]>", file=code) + print(" ", + file=code) + print(" ", file=code) + else: + print(" " + naming_rule + "", file=code) + print(" " + json_data['original_naming_rule'] + "", file=code) + print(" " + naming_rule + "", file=code) + print(" JP-18+", file=code) + print(" JP-18+", file=code) + try: + print(" " + series + "", file=code) + except: + print(" ", file=code) + print(" " + studio + "", file=code) + print(" " + year + "", file=code) + if not config.getInstance().jellyfin(): + print(" ", file=code) + print(" ", file=code) + else: + print(" " + outline + "", file=code) + print(" " + outline + "", file=code) + print(" " + str(runtime).replace(" ", "") + "", file=code) + + if False != conf.get_direct(): + print(" " + director + "", file=code) + + print(" " + poster_path + "", file=code) + print(" " + thumb_path + "", file=code) + if not config.getInstance().jellyfin(): # jellyfin 不需要保存fanart + print(" " + fanart_path + "", file=code) + try: + for key in actor_list: + print(" ", file=code) + print(" " + key + "", file=code) + try: + print(" " + actor_photo.get(str(key)) + "", file=code) + except: + pass + print(" ", file=code) + except: + pass + print(" " + studio + "", file=code) + print(" ", file=code) + + jellyfin = config.getInstance().jellyfin() + if not jellyfin: + if config.getInstance().actor_only_tag(): + for key in actor_list: + try: + print(" " + key + "", file=code) + except: + pass + else: + if cn_sub: + print(" 中文字幕", file=code) + if _4k: + print(" 4k", file=code) + if iso: + print(" 原盘", file=code) + for i in tag: + print(" " + i + "", file=code) + if cn_sub: + print(" 中文字幕", file=code) + if _4k: + print(" 4k", file=code) + try: + for i in tag: + print(" " + i + "", file=code) + except: + pass + print(" " + number + "", file=code) + print(" " + release + 
"", file=code) + print(" " + release + "", file=code) + print(" " + release + "", file=code) + if old_nfo: + try: + xur = old_nfo.xpath('//userrating/text()')[0] + if isinstance(xur, str) and re.match('\d+\.\d+|\d+', xur.strip()): + print(f" {xur.strip()}", file=code) + except: + pass + try: + f_rating = json_data.get('userrating') + uc = json_data.get('uservotes') + print(f""" {round(f_rating * 2.0, 1)} + {round(f_rating * 20.0, 1)} + + + {f_rating} + {uc} + + """, file=code) + except: + if old_nfo: + try: + for rtag in ('rating', 'criticrating'): + xur = old_nfo.xpath(f'//{rtag}/text()')[0] + if isinstance(xur, str) and re.match('\d+\.\d+|\d+', xur.strip()): + print(f" <{rtag}>{xur.strip()}", file=code) + f_rating = old_nfo.xpath(f"//ratings/rating[@name='jdb']/value/text()")[0] + uc = old_nfo.xpath(f"//ratings/rating[@name='jdb']/votes/text()")[0] + print(f""" + + {f_rating} + {uc} + + """, file=code) + except: + pass + if config.getInstance().is_trailer(): + print(" " + trailer + "", file=code) + print("", file=code) + print("[+]Wrote! " + nfo_path) + except IOError as e: + print("[-]Write Failed!") + print("[-]", e) + moveFailedFolder(filepath) + return + except Exception as e1: + print("[-]Write Failed!") + print("[-]", e1) + moveFailedFolder(filepath) + return + + +def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, hack, _4k, iso) -> None: + """ + add watermark on poster or thumb for describe extra properties 给海报和缩略图加属性水印 + + :poster_path 海报位置 + :thumb_path 缩略图位置 + :cn_sub: 中文字幕 可选值:1,"1" 或其他值 + :uncensored 无码 可选值:1,"1" 或其他值 + :hack 破解 可选值:1,"1" 或其他值 + :_4k Bool + """ + mark_type = '' + if cn_sub: + mark_type += ',字幕' + if _4k: + mark_type += ',4k' + if iso: + mark_type += ',iso' + if mark_type == '': + return + add_mark_thread(thumb_path, cn_sub, leak, uncensored, hack, _4k, iso) + add_mark_thread(poster_path, cn_sub, leak, uncensored, hack, _4k, iso) + print('[+]Add Mark: ' + mark_type.strip(',')) + + +def add_mark_thread(pic_path, cn_sub, leak, uncensored, hack, _4k, iso): + size = 9 + img_pic = Image.open(pic_path) + # 获取自定义位置,取余配合pos达到顺时针添加的效果 + # 左上 0, 右上 1, 右下 2, 左下 3 + count = config.getInstance().watermark_type() + if cn_sub: + add_to_pic(pic_path, img_pic, size, count, 1) # 添加 + count = (count + 1) % 4 + if leak: + add_to_pic(pic_path, img_pic, size, count, 2) + count = (count + 1) % 4 + if uncensored: + add_to_pic(pic_path, img_pic, size, count, 3) + count = (count + 1) % 4 + if hack: + add_to_pic(pic_path, img_pic, size, count, 4) + count = (count + 1) % 4 + if _4k: + add_to_pic(pic_path, img_pic, size, count, 5) + count = (count + 1) % 4 + if iso: + add_to_pic(pic_path, img_pic, size, count, 6) + img_pic.close() + + +def add_to_pic(pic_path, img_pic, size, count, mode): + mark_pic_path = '' + pngpath = '' + if mode == 1: + pngpath = "Img/SUB.png" + elif mode == 5: + pngpath = "Img/4K.png" + elif mode == 6: + pngpath = "Img/ISO.png" + else: + print('[-]Error: watermark image param mode invalid!') + return + # 先找pyinstaller打包的图片 + if hasattr(sys, '_MEIPASS') and os.path.isfile(os.path.join(getattr(sys, '_MEIPASS'), pngpath)): + mark_pic_path = os.path.join(getattr(sys, '_MEIPASS'), pngpath) + # 再找py脚本所在路径的图片 + elif os.path.isfile(os.path.join(os.path.dirname(os.path.realpath(__file__)), pngpath)): + mark_pic_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), pngpath) + # 如果没有本地图片才通过网络下载 + else: + mark_pic_path = BytesIO( + get_html("https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/" + pngpath, + return_type="content")) + 
img_subt = Image.open(mark_pic_path) + scroll_high = int(img_pic.height / size) + scroll_wide = int(scroll_high * img_subt.width / img_subt.height) + img_subt = img_subt.resize((scroll_wide, scroll_high), Image.LANCZOS) + r, g, b, a = img_subt.split() # 获取颜色通道,保持png的透明性 + # 封面四个角的位置 + pos = [ + {'x': 0, 'y': 0}, + {'x': img_pic.width - scroll_wide, 'y': 0}, + {'x': img_pic.width - scroll_wide, 'y': img_pic.height - scroll_high}, + {'x': 0, 'y': img_pic.height - scroll_high}, + ] + img_pic.paste(img_subt, (pos[count]['x'], pos[count]['y']), mask=a) + img_pic.save(pic_path, quality=95) + + +# ========================结束================================= + + +def paste_file_to_folder(filepath, path, multi_part, number, part, leak_word, c_word, hack_word): # 文件路径,番号,后缀,要移动至的位置 + filepath_obj = pathlib.Path(filepath) + houzhui = filepath_obj.suffix + try: + targetpath = os.path.join(path, f"{number}{leak_word}{c_word}{hack_word}{houzhui}") + # 任何情况下都不要覆盖,以免遭遇数据源或者引擎错误导致所有文件得到同一个number,逐一 + # 同名覆盖致使全部文件损失且不可追回的最坏情况 + if os.path.exists(targetpath): + raise FileExistsError('File Exists on destination path, we will never overwriting.') + link_mode = config.getInstance().link_mode() + # 如果link_mode 1: 建立软链接 2: 硬链接优先、无法建立硬链接再尝试软链接。 + # 移除原先soft_link=2的功能代码,因默认记录日志,已经可追溯文件来源 + create_softlink = False + if link_mode not in (1, 2): + shutil.move(filepath, targetpath) + elif link_mode == 2: + # 跨卷或跨盘符无法建立硬链接导致异常,回落到建立软链接 + try: + os.link(filepath, targetpath, follow_symlinks=False) + except: + create_softlink = True + if link_mode == 1 or create_softlink: + # 先尝试采用相对路径,以便网络访问时能正确打开视频,失败则可能是因为跨盘符等原因无法支持 + # 相对路径径,改用绝对路径方式尝试建立软链接 + try: + filerelpath = os.path.relpath(filepath, path) + os.symlink(filerelpath, targetpath) + except: + os.symlink(str(filepath_obj.resolve()), targetpath) + return + + except FileExistsError as fee: + print(f'[-]FileExistsError: {fee}') + moveFailedFolder(filepath) + return + except PermissionError: + print('[-]Error! Please run as administrator!') + return + except OSError as oserr: + print(f'[-]OS Error errno {oserr.errno}') + return + + +def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word, + hack_word): # 文件路径,番号,后缀,要移动至的位置 + if multi_part == 1: + number += part # 这时number会被附加上CD1后缀 + filepath_obj = pathlib.Path(filepath) + houzhui = filepath_obj.suffix + targetpath = os.path.join(path, f"{number}{part}{leak_word}{c_word}{hack_word}{houzhui}") + if os.path.exists(targetpath): + raise FileExistsError('File Exists on destination path, we will never overwriting.') + try: + link_mode = config.getInstance().link_mode() + create_softlink = False + if link_mode not in (1, 2): + shutil.move(filepath, targetpath) + print("[!]Move => ", path) + return + elif link_mode == 2: + try: + os.link(filepath, targetpath, follow_symlinks=False) + except: + create_softlink = True + if link_mode == 1 or create_softlink: + try: + filerelpath = os.path.relpath(filepath, path) + os.symlink(filerelpath, targetpath) + except: + os.symlink(str(filepath_obj.resolve()), targetpath) + print("[!]Link => ", path) + except FileExistsError as fee: + print(f'[-]FileExistsError: {fee}') + except PermissionError: + print('[-]Error! 
Please run as administrator!') + except OSError as oserr: + print(f'[-]OS Error errno {oserr.errno}') + + +def linkImage(path, number, part, leak_word, c_word, hack_word, ext): + """ + 首先尝试为图片建立符合Jellyfin封面图文件名规则的硬连接以节省磁盘空间 + 如果目标目录无法建立硬链接则将图片复制一份成为常规文件 + 常规文件日期已经存在时,若修改日期比源文件更旧,则将被新的覆盖,否则忽略 + """ + if not all(len(v) for v in (path, number, part, ext)): + return + covers = ("-fanart", "-poster", "-thumb") + normal_prefix = f"{number}{leak_word}{c_word}{hack_word}" + multi_prefix = f"{number}{part}{leak_word}{c_word}{hack_word}" + normal_pathes = (Path(path) / f"{normal_prefix}{c}{ext}" for c in covers) + multi_pathes = (Path(path) / f"{multi_prefix}{c}{ext}" for c in covers) + for normal_path, multi_path in zip(normal_pathes, multi_pathes): + if not normal_path.is_file(): + continue + mkLink = False + if not multi_path.exists(): + mkLink = True + elif multi_path.is_file(): + if multi_path.stat().st_nlink > 1: + continue + elif normal_path.stat().st_mtime <= multi_path.stat().st_mtime: + continue + mkLink = True + multi_path.unlink(missing_ok=True) + if not mkLink: + continue + try: + os.link(str(normal_path), str(multi_path), follow_symlinks=False) + except: + shutil.copyfile(str(normal_path), str(multi_path)) + + +def debug_print(data: json): + try: + print("[+] ------- DEBUG INFO -------") + for i, v in data.items(): + if i == 'outline': + print('[+] -', "%-19s" % i, ':', len(v), 'characters') + continue + if i == 'actor_photo' or i == 'year': + continue + if i == 'extrafanart': + print('[+] -', "%-19s" % i, ':', len(v), 'links') + continue + print(f'[+] - {i:<{cn_space(i, 19)}} : {v}') + + print("[+] ------- DEBUG INFO -------") + except: + pass + + +def core_main_no_net_op(movie_path, number): + conf = config.getInstance() + part = '' + leak_word = '' + leak = False + c_word = '' + cn_sub = False + hack = False + hack_word = '' + _4k = False + iso = False + imagecut = 1 + multi = False + part = '' + path = str(Path(movie_path).parent) + + if re.search('[-_]CD\d+', movie_path, re.IGNORECASE): + part = re.findall('[-_]CD\d+', movie_path, re.IGNORECASE)[0].upper() + multi = True + if re.search(r'[-_]C(\.\w+$|-\w+)|\d+ch(\.\w+$|-\w+)', movie_path, + re.I) or '中文' in movie_path or '字幕' in movie_path or ".chs" in movie_path or '.cht' in movie_path: + cn_sub = True + c_word = '-C' # 中文字幕影片后缀 + uncensored = True if is_uncensored(number) else 0 + + if '4k'.upper() in str(movie_path).upper() or '4k' in movie_path: + _4k = True + + if '.iso'.upper() in str(movie_path).upper() or '.iso' in movie_path: + iso = True + # try: + + # props = get_video_properties(movie_path) # 判断是否为4K视频 + # if props['width'] >= 4096 or props['height'] >= 2160: + # _4k = True + # except: + # pass + prestr = f"{number}{leak_word}{c_word}{hack_word}" + + full_nfo = Path(path) / f"{prestr}{part}.nfo" + if full_nfo.is_file(): + try: + nfo_xml = etree.parse(full_nfo) + nfo_fanart_path = nfo_xml.xpath('//fanart/text()')[0] + ext = Path(nfo_fanart_path).suffix + except: + return + else: + return + fanart_path = f"fanart{ext}" + poster_path = f"poster{ext}" + thumb_path = f"thumb{ext}" + if config.getInstance().image_naming_with_number(): + fanart_path = f"{prestr}-fanart{ext}" + poster_path = f"{prestr}-poster{ext}" + thumb_path = f"{prestr}-thumb{ext}" + full_fanart_path = os.path.join(path, fanart_path) + full_poster_path = os.path.join(path, poster_path) + full_thumb_path = os.path.join(path, thumb_path) + + if not all(os.path.isfile(f) for f in (full_fanart_path, full_thumb_path)): + return + + cutImage(imagecut, path, 
fanart_path, poster_path, bool(conf.face_uncensored_only() and not uncensored)) + if conf.is_watermark(): + add_mark(full_poster_path, full_thumb_path, cn_sub, leak, uncensored, hack, _4k, iso) + + if multi and conf.jellyfin_multi_part_fanart(): + linkImage(path, number, part, leak_word, c_word, hack_word, ext) + + +def move_subtitles(filepath, path, multi_part, number, part, leak_word, c_word, hack_word) -> bool: + filepath_obj = pathlib.Path(filepath) + link_mode = config.getInstance().link_mode() + sub_res = config.getInstance().sub_rule() + result = False + for subfile in filepath_obj.parent.glob('**/*'): + if subfile.is_file() and subfile.suffix.lower() in sub_res: + if multi_part and part.lower() not in subfile.name.lower(): + continue + if filepath_obj.stem.split('.')[0].lower() != subfile.stem.split('.')[0].lower(): + continue + sub_targetpath = Path(path) / f"{number}{leak_word}{c_word}{hack_word}{''.join(subfile.suffixes)}" + if link_mode not in (1, 2): + shutil.move(str(subfile), str(sub_targetpath)) + print(f"[+]Sub Moved! {sub_targetpath.name}") + result = True + else: + shutil.copyfile(str(subfile), str(sub_targetpath)) + print(f"[+]Sub Copied! {sub_targetpath.name}") + result = True + if result: + break + return result + + +def core_main(movie_path, number_th, oCC, specified_source=None, specified_url=None): + conf = config.getInstance() + # =======================================================================初始化所需变量 + multi_part = False + part = '' + leak_word = '' + c_word = '' + cn_sub = False + liuchu = False + hack = False + hack_word = '' + _4k = False + iso = False + + # 下面被注释的变量不需要 + # rootpath = os.getcwd + number = number_th + json_data = get_data_from_json(number, oCC, specified_source, specified_url) # 定义番号 + + # Return if blank dict returned (data not found) + if not json_data: + moveFailedFolder(movie_path) + return + + if json_data["number"] != number: + # fix issue #119 + # the root cause is we normalize the search id + # print_files() will use the normalized id from website, + # but paste_file_to_folder() still use the input raw search id + # so the solution is: use the normalized search id + number = json_data["number"] + imagecut = json_data.get('imagecut') + tag = json_data.get('tag') + # =======================================================================判断-C,-CD后缀 + if re.search('[-_]CD\d+', movie_path, re.IGNORECASE): + multi_part = True + part = re.findall('[-_]CD\d+', movie_path, re.IGNORECASE)[0].upper() + if re.search(r'[-_]C(\.\w+$|-\w+)|\d+ch(\.\w+$|-\w+)', movie_path, + re.I) or '中文' in movie_path or '字幕' in movie_path: + cn_sub = True + c_word = '-C' # 中文字幕影片后缀 + + if re.search(r'[-_]UC(\.\w+$|-\w+)', movie_path, + re.I): + cn_sub = True + hack_word = '-UC' # + hack = True + + if re.search(r'[-_]U(\.\w+$|-\w+)', movie_path, + re.I):# + hack = True + hack_word = '-U' + uncensored = int(unce) if isinstance(unce, bool) else int(is_uncensored(number)) + + if '4k'.upper() in str(movie_path).upper() or '4k' in movie_path: + _4k = True + + if '.iso'.upper() in str(movie_path).upper() or '.iso' in movie_path: + iso = True + + # 判断是否4k + if '4K' in tag: + tag.remove('4K') # 从tag中移除'4K' + + + # try: + # props = get_video_properties(movie_path) # 判断是否为4K视频 + # if props['width'] >= 4096 or props['height'] >= 2160: + # _4k = True + # except: + # pass + + # 调试模式检测 + if conf.debug(): + debug_print(json_data) + + # 创建文件夹 + # path = create_folder(rootpath + '/' + conf.success_folder(), json_data.get('location_rule'), json_data) + + cover = 
json_data.get('cover') + ext = image_ext(cover) + + fanart_path = f"fanart{ext}" + poster_path = f"poster{ext}" + thumb_path = f"thumb{ext}" + if config.getInstance().image_naming_with_number(): + fanart_path = f"{number}{leak_word}{c_word}{hack_word}-fanart{ext}" + poster_path = f"{number}{leak_word}{c_word}{hack_word}-poster{ext}" + thumb_path = f"{number}{leak_word}{c_word}{hack_word}-thumb{ext}" + + # main_mode + # 1: 刮削模式 / Scraping mode + # 2: 整理模式 / Organizing mode + # 3:不改变路径刮削 + if conf.main_mode() == 1: + # 创建文件夹 + path = create_folder(json_data) + if multi_part == 1: + number += part # 这时number会被附加上CD1后缀 + + # 检查小封面, 如果image cut为3,则下载小封面 + if imagecut == 3: + if 'headers' in json_data: + small_cover_check(path, poster_path, json_data.get('cover_small'), movie_path, json_data) + else: + small_cover_check(path, poster_path, json_data.get('cover_small'), movie_path) + + # creatFolder会返回番号路径 + if 'headers' in json_data: + image_download(cover, fanart_path, thumb_path, path, movie_path, json_data) + else: + image_download(cover, fanart_path, thumb_path, path, movie_path) + + if not multi_part or part.lower() == '-cd1': + try: + # 下载预告片 + if conf.is_trailer() and json_data.get('trailer'): + trailer_download(json_data.get('trailer'), leak_word, c_word, hack_word, number, path, movie_path) + + # 下载剧照 data, path, filepath + if conf.is_extrafanart() and json_data.get('extrafanart'): + if 'headers' in json_data: + extrafanart_download(json_data.get('extrafanart'), path, number, movie_path, json_data) + else: + extrafanart_download(json_data.get('extrafanart'), path, number, movie_path) + + # 下载演员头像 KODI .actors 目录位置 + if conf.download_actor_photo_for_kodi(): + actor_photo_download(json_data.get('actor_photo'), path, number) + except: + pass + + # 裁剪图 + cutImage(imagecut, path, thumb_path, poster_path, bool(conf.face_uncensored_only() and not uncensored)) + + # 兼容Jellyfin封面图文件名规则 + if multi_part and conf.jellyfin_multi_part_fanart(): + linkImage(path, number_th, part, leak_word, c_word, hack_word, ext) + + # 移动电影 + paste_file_to_folder(movie_path, path, multi_part, number, part, leak_word, c_word, hack_word) + + # Move subtitles + move_status = move_subtitles(movie_path, path, multi_part, number, part, leak_word, c_word, hack_word) + if move_status: + cn_sub = True + # 添加水印 + if conf.is_watermark(): + add_mark(os.path.join(path, poster_path), os.path.join(path, thumb_path), cn_sub, leak, uncensored, + hack, _4k, iso) + + # 最后输出.nfo元数据文件,以完成.nfo文件创建作为任务成功标志 + print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, movie_path, tag, + json_data.get('actor_list'), liuchu, uncensored, hack, hack_word + , _4k, fanart_path, poster_path, thumb_path, iso) + + elif conf.main_mode() == 2: + # 创建文件夹 + path = create_folder(json_data) + # 移动文件 + paste_file_to_folder_mode2(movie_path, path, multi_part, number, part, leak_word, c_word, hack_word) + + # Move subtitles + move_subtitles(movie_path, path, multi_part, number, part, leak_word, c_word, hack_word) + + elif conf.main_mode() == 3: + path = str(Path(movie_path).parent) + if multi_part == 1: + number += part # 这时number会被附加上CD1后缀 + + # 检查小封面, 如果image cut为3,则下载小封面 + if imagecut == 3: + if 'headers' in json_data: + small_cover_check(path, poster_path, json_data.get('cover_small'), movie_path, json_data) + else: + small_cover_check(path, poster_path, json_data.get('cover_small'), movie_path) + + # creatFolder会返回番号路径 + if 'headers' in json_data: + image_download(cover, fanart_path, thumb_path, path, movie_path, 
json_data) + else: + image_download(cover, fanart_path, thumb_path, path, movie_path) + + if not multi_part or part.lower() == '-cd1': + try: + # 下载预告片 + if conf.is_trailer() and json_data.get('trailer'): + trailer_download(json_data.get('trailer'), leak_word, c_word, hack_word, number, path, movie_path) + + # 下载剧照 data, path, filepath + if conf.is_extrafanart() and json_data.get('extrafanart'): + if 'headers' in json_data: + extrafanart_download(json_data.get('extrafanart'), path, number, movie_path, json_data) + else: + extrafanart_download(json_data.get('extrafanart'), path, number, movie_path) + + # 下载演员头像 KODI .actors 目录位置 + if conf.download_actor_photo_for_kodi(): + actor_photo_download(json_data.get('actor_photo'), path, number) + except: + pass + + # 裁剪图 + cutImage(imagecut, path, fanart_path, poster_path, bool(conf.face_uncensored_only() and not uncensored)) + + # 添加水印 + if conf.is_watermark(): + add_mark(os.path.join(path, poster_path), os.path.join(path, fanart_path), cn_sub, leak, uncensored, hack, + _4k, iso) + + # 兼容Jellyfin封面图文件名规则 + if multi_part and conf.jellyfin_multi_part_fanart(): + linkImage(path, number_th, part, leak_word, c_word, hack_word, ext) + + # 最后输出.nfo元数据文件,以完成.nfo文件创建作为任务成功标志 + print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, movie_path, + tag, json_data.get('actor_list'), liuchu, uncensored, hack, hack_word, _4k, fanart_path, + poster_path, + thumb_path, iso) diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000..b0c5abf --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,16 @@ +FROM python:slim +RUN sed -i 's/deb.debian.org/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list \ + && sed -i 's/security.debian.org/mirrors.tuna.tsinghua.edu.cn/g' /etc/apt/sources.list +RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pip -U \ + && pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple + +RUN apt-get update \ + && apt-get install -y wget ca-certificates \ + && wget -O - 'https://github.com/yoshiko2/Movie_Data_Capture/archive/master.tar.gz' | tar xz \ + && mv Movie_Data_Capture-master /mdc \ + && cd /mdc \ + && ( pip install --no-cache-dir -r requirements.txt || true ) \ + && pip install --no-cache-dir requests lxml Beautifulsoup4 pillow \ + && apt-get purge -y wget + +WORKDIR /mdc diff --git a/docker/config.ini b/docker/config.ini new file mode 100644 index 0000000..1038a5e --- /dev/null +++ b/docker/config.ini @@ -0,0 +1,27 @@ +[common] +main_mode=1 +failed_output_folder=data/failure_output +success_output_folder=data/organized +link_mode=0 + +[proxy] +proxy= +timeout=10 +retry=3 + +[Name_Rule] +location_rule=actor+'/'+title +naming_rule=number+'-'+title + +[update] +update_check=0 + +[escape] +literals=\()/ +folders=data/failure_output,data/organized + +[debug_mode] +switch=0 + +[media] +media_warehouse=plex diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml new file mode 100644 index 0000000..82b78b9 --- /dev/null +++ b/docker/docker-compose.yaml @@ -0,0 +1,13 @@ +version: "2.2" +services: + jav: + user: "${MDCUID}:${MDCGID}" + image: jav:local + build: . 
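+ # MDCUID/MDCGID and MDC_PATH are substituted by docker-compose from the shell
+ # environment or an .env file; for example (illustrative values only):
+ # MDCUID=$(id -u) MDCGID=$(id -g) MDC_PATH=/srv/media docker-compose up -d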
+ volumes: + - ./config.ini:/mdc/config.ini + - ${MDC_PATH}:/mdc/data + command: + - python + - /mdc/Movie_Data_Capture.py + - -a diff --git a/number_parser.py b/number_parser.py new file mode 100755 index 0000000..ec6e6bf --- /dev/null +++ b/number_parser.py @@ -0,0 +1,229 @@ +import os +import re +import sys +import config +import typing + +G_spat = re.compile( + "^\w+\.(cc|com|net|me|club|jp|tv|xyz|biz|wiki|info|tw|us|de)@|^22-sht\.me|" + "^(fhd|hd|sd|1080p|720p|4K)(-|_)|" + "(-|_)(fhd|hd|sd|1080p|720p|4K|x264|x265|uncensored|hack|leak)", + re.IGNORECASE) + + +def get_number(debug: bool, file_path: str) -> str: + """ + 从文件路径中提取号码 from number_parser import get_number + """ + filepath = os.path.basename(file_path) + # debug True 和 False 两块代码块合并,原因是此模块及函数只涉及字符串计算,没有IO操作,debug on时输出导致异常信息即可 + try: + # 先对自定义正则进行匹配 + if config.getInstance().number_regexs().split().__len__() > 0: + for regex in config.getInstance().number_regexs().split(): + try: + if re.search(regex, filepath): + return re.search(regex, filepath).group() + except Exception as e: + print(f'[-]custom regex exception: {e} [{regex}]') + + file_number = get_number_by_dict(filepath) + if file_number: + return file_number + elif '字幕组' in filepath or 'SUB' in filepath.upper() or re.match(r'[\u30a0-\u30ff]+', filepath): + filepath = G_spat.sub("", filepath) + filepath = re.sub("\[.*?\]","",filepath) + filepath = filepath.replace(".chs", "").replace(".cht", "") + file_number = str(re.findall(r'(.+?)\.', filepath)).strip(" [']") + return file_number + elif '-' in filepath or '_' in filepath: # 普通提取番号 主要处理包含减号-和_的番号 + filepath = G_spat.sub("", filepath) + filename = str(re.sub("\[\d{4}-\d{1,2}-\d{1,2}\] - ", "", filepath)) # 去除文件名中时间 + lower_check = filename.lower() + if 'fc2' in lower_check: + filename = lower_check.replace('--', '-').replace('_', '-').upper() + filename = re.sub("[-_]cd\d{1,2}", "", filename, flags=re.IGNORECASE) + if not re.search("-|_", filename): # 去掉-CD1之后再无-的情况,例如n1012-CD1.wmv + return str(re.search(r'\w+', filename[:filename.find('.')], re.A).group()) + file_number = os.path.splitext(filename) + filename = re.search(r'[\w\-_]+', filename, re.A) + if filename: + file_number = str(filename.group()) + else: + file_number = file_number[0] + + new_file_number = file_number + if re.search("-c", file_number, flags=re.IGNORECASE): + new_file_number = re.sub("(-|_)c$", "", file_number, flags=re.IGNORECASE) + elif re.search("-u$", file_number, flags=re.IGNORECASE): + new_file_number = re.sub("(-|_)u$", "", file_number, flags=re.IGNORECASE) + elif re.search("-uc$", file_number, flags=re.IGNORECASE): + new_file_number = re.sub("(-|_)uc$", "", file_number, flags=re.IGNORECASE) + elif re.search("\d+ch$", file_number, flags=re.I): + new_file_number = file_number[:-2] + + return new_file_number.upper() + else: # 提取不含减号-的番号,FANZA CID + # 欧美番号匹配规则 + oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath) + if oumei: + return oumei.group() + try: + return str( + re.findall(r'(.+?)\.', + str(re.search('([^<>/\\\\|:""\\*\\?]+)\\.\\w+$', filepath).group()))).strip( + "['']").replace('_', '-') + except: + return str(re.search(r'(.+?)\.', filepath)[0]) + except Exception as e: + if debug: + print(f'[-]Number Parser exception: {e} [{file_path}]') + return None + +G_TAKE_NUM_RULES = { + 'tokyo.*hot': lambda x: str(re.search(r'(cz|gedo|k|n|red-|se)\d{2,4}', x, re.I).group()), + 'carib': lambda x: str(re.search(r'\d{6}(-|_)\d{3}', x, re.I).group()).replace('_', '-'), + '1pon|mura|paco': lambda x: str(re.search(r'\d{6}(-|_)\d{3}', x, 
re.I).group()).replace('-', '_'), + '10mu': lambda x: str(re.search(r'\d{6}(-|_)\d{2}', x, re.I).group()).replace('-', '_'), + 'x-art': lambda x: str(re.search(r'x-art\.\d{2}\.\d{2}\.\d{2}', x, re.I).group()), + 'xxx-av': lambda x: ''.join(['xxx-av-', re.findall(r'xxx-av[^\d]*(\d{3,5})[^\d]*', x, re.I)[0]]), + 'heydouga': lambda x: 'heydouga-' + '-'.join(re.findall(r'(\d{4})[\-_](\d{3,4})[^\d]*', x, re.I)[0]), + 'heyzo': lambda x: 'HEYZO-' + re.findall(r'heyzo[^\d]*(\d{4})', x, re.I)[0], + 'mdbk': lambda x: str(re.search(r'mdbk(-|_)(\d{4})', x, re.I).group()), + 'mdtm': lambda x: str(re.search(r'mdtm(-|_)(\d{4})', x, re.I).group()), + 'caribpr': lambda x: str(re.search(r'\d{6}(-|_)\d{3}', x, re.I).group()).replace('_', '-'), +} + + +def get_number_by_dict(filename: str) -> typing.Optional[str]: + try: + for k, v in G_TAKE_NUM_RULES.items(): + if re.search(k, filename, re.I): + return v(filename) + except: + pass + return None + + +class Cache_uncensored_conf: + prefix = None + + def is_empty(self): + return bool(self.prefix is None) + + def set(self, v: list): + if not v or not len(v) or not len(v[0]): + raise ValueError('input prefix list empty or None') + s = v[0] + if len(v) > 1: + for i in v[1:]: + s += f"|{i}.+" + self.prefix = re.compile(s, re.I) + + def check(self, number): + if self.prefix is None: + raise ValueError('No init re compile') + return self.prefix.match(number) + + +G_cache_uncensored_conf = Cache_uncensored_conf() + + +# ========================================================================是否为无码 +def is_uncensored(number) -> bool: + if re.match( + r'[\d-]{4,}|\d{6}_\d{2,3}|(cz|gedo|k|n|red-|se)\d{2,4}|heyzo.+|xxx-av-.+|heydouga-.+|x-art\.\d{2}\.\d{2}\.\d{2}', + number, + re.I + ): + return True + if G_cache_uncensored_conf.is_empty(): + G_cache_uncensored_conf.set(config.getInstance().get_uncensored().split(',')) + return bool(G_cache_uncensored_conf.check(number)) + + +if __name__ == "__main__": + # import doctest + # doctest.testmod(raise_on_error=True) + test_use_cases = ( + "ABC-123-C.mp4", + ) + + + def evprint(evstr): + code = compile(evstr, "", "eval") + print("{1:>20} # '{0}'".format(evstr[18:-2], eval(code))) + + + for t in test_use_cases: + evprint(f'get_number(True, "{t}")') + + if len(sys.argv) <= 1 or not re.search('^[A-Z]:?', sys.argv[1], re.IGNORECASE): + sys.exit(0) + + # 使用Everything的ES命令行工具搜集全盘视频文件名作为用例测试number数据,参数为盘符 A .. 
Z 或带盘符路径 + # https://www.voidtools.com/support/everything/command_line_interface/ + # ES命令行工具需要Everything文件搜索引擎处于运行状态,es.exe单个执行文件需放入PATH路径中。 + # Everything是免费软件 + # 示例: + # python.exe .\number_parser.py ALL # 从所有磁盘搜索视频 + # python.exe .\number_parser.py D # 从D盘搜索 + # python.exe .\number_parser.py D: # 同上 + # python.exe .\number_parser.py D:\download\JAVs # 搜索D盘的\download\JAVs目录,路径必须带盘符 + # ================== + # Linux/WSL1|2 使用mlocate(Ubuntu/Debian)或plocate(Debian sid)搜集全盘视频文件名作为测试用例number数据 + # 需安装'sudo apt install mlocate或plocate'并首次运行sudo updatedb建立全盘索引 + # MAC OS X 使用findutils的glocate,需安装'sudo brew install findutils'并首次运行sudo gupdatedb建立全盘索引 + # 示例: + # python3 ./number_parser.py ALL + import subprocess + + ES_search_path = "ALL disks" + if sys.argv[1] == "ALL": + if sys.platform == "win32": + # ES_prog_path = 'C:/greensoft/es/es.exe' + ES_prog_path = 'es.exe' # es.exe需要放在PATH环境变量的路径之内 + ES_cmdline = f'{ES_prog_path} -name size:gigantic ext:mp4;avi;rmvb;wmv;mov;mkv;flv;ts;webm;iso;mpg;m4v' + out_bytes = subprocess.check_output(ES_cmdline.split(' ')) + out_text = out_bytes.decode('gb18030') # 中文版windows 10 x64默认输出GB18030,此编码为UNICODE方言与UTF-8系全射关系无转码损失 + out_list = out_text.splitlines() + elif sys.platform in ("linux", "darwin"): + ES_prog_path = 'locate' if sys.platform == 'linux' else 'glocate' + ES_cmdline = r"{} -b -i --regex '\.mp4$|\.avi$|\.rmvb$|\.wmv$|\.mov$|\.mkv$|\.webm$|\.iso$|\.mpg$|\.m4v$'".format( + ES_prog_path) + out_bytes = subprocess.check_output(ES_cmdline.split(' ')) + out_text = out_bytes.decode('utf-8') + out_list = [os.path.basename(line) for line in out_text.splitlines()] + else: + print('[-]Unsupported platform! Please run on OS Windows/Linux/MacOSX. Exit.') + sys.exit(1) + else: # Windows single disk + if sys.platform != "win32": + print('[!]Usage: python3 ./number_parser.py ALL') + sys.exit(0) + # ES_prog_path = 'C:/greensoft/es/es.exe' + ES_prog_path = 'es.exe' # es.exe需要放在PATH环境变量的路径之内 + if os.path.isdir(sys.argv[1]): + ES_search_path = sys.argv[1] + else: + ES_search_path = sys.argv[1][0] + ':/' + if not os.path.isdir(ES_search_path): + ES_search_path = 'C:/' + ES_search_path = os.path.normcase(ES_search_path) + ES_cmdline = f'{ES_prog_path} -path {ES_search_path} -name size:gigantic ext:mp4;avi;rmvb;wmv;mov;mkv;webm;iso;mpg;m4v' + out_bytes = subprocess.check_output(ES_cmdline.split(' ')) + out_text = out_bytes.decode('gb18030') # 中文版windows 10 x64默认输出GB18030,此编码为UNICODE方言与UTF-8系全射关系无转码损失 + out_list = out_text.splitlines() + print(f'\n[!]{ES_prog_path} is searching {ES_search_path} for movies as number parser test cases...') + print(f'[+]Find {len(out_list)} Movies.') + for filename in out_list: + try: + n = get_number(True, filename) + if n: + print(' [{0}] {2}# {1}'.format(n, filename, '#无码' if is_uncensored(n) else '')) + else: + print(f'[-]Number return None. # {filename}') + except Exception as e: + print(f'[-]Number Parser exception: {e} [{filename}]') + + sys.exit(0) diff --git a/py_to_exe.ps1 b/py_to_exe.ps1 new file mode 100644 index 0000000..14d3284 --- /dev/null +++ b/py_to_exe.ps1 @@ -0,0 +1,26 @@ +# If you can't run this script, please execute the following command in PowerShell. 
+# Set-ExecutionPolicy RemoteSigned -Scope CurrentUser -Force + +$CLOUDSCRAPER_PATH = $( python -c 'import cloudscraper as _; print(_.__path__[0])' | select -Last 1 ) +$OPENCC_PATH = $( python -c 'import opencc as _; print(_.__path__[0])' | select -Last 1 ) +$FACE_RECOGNITION_MODELS = $( python -c 'import face_recognition_models as _; print(_.__path__[0])' | select -Last 1 ) + +mkdir build +mkdir __pycache__ + +pyinstaller --onefile Movie_Data_Capture.py ` + --hidden-import "ImageProcessing.cnn" ` + --python-option u ` + --add-data "$FACE_RECOGNITION_MODELS;face_recognition_models" ` + --add-data "$CLOUDSCRAPER_PATH;cloudscraper" ` + --add-data "$OPENCC_PATH;opencc" ` + --add-data "Img;Img" ` + --add-data "config.ini;." ` + --add-data "scrapinglib;scrapinglib" ` + +rmdir -Recurse -Force build +rmdir -Recurse -Force __pycache__ +rmdir -Recurse -Force Movie_Data_Capture.spec + +echo "[Make]Finish" +pause \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..860ca20 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,14 @@ +requests +dlib-bin +Click +numpy +face-recognition-models +lxml +beautifulsoup4 +pillow==10.0.1 +cloudscraper +pysocks==1.7.1 +urllib3==1.26.18 +certifi +MechanicalSoup +opencc-python-reimplemented diff --git a/scraper.py b/scraper.py new file mode 100644 index 0000000..c5581df --- /dev/null +++ b/scraper.py @@ -0,0 +1,253 @@ +# build-in lib +import json +import secrets +import typing +from pathlib import Path + +# third party lib +import opencc +from lxml import etree +# project wide definitions +import config +from ADC_function import (translate, + load_cookies, + file_modification_days, + delete_all_elements_in_str, + delete_all_elements_in_list + ) +from scrapinglib.api import search + + +def get_data_from_json( + file_number: str, + open_cc: opencc.OpenCC, + specified_source: str, specified_url: str) -> typing.Optional[dict]: + """ + iterate through all services and fetch the data 从网站上查询片名解析JSON返回元数据 + :param file_number: 影片名称 + :param open_cc: 简繁转换器 + :param specified_source: 指定的媒体数据源 + :param specified_url: 指定的数据查询地址, 目前未使用 + :return 给定影片名称的具体信息 + """ + try: + actor_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_actor.xml')) + info_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_info.xml')) + except: + actor_mapping_data = etree.fromstring("", etree.HTMLParser()) + info_mapping_data = etree.fromstring("", etree.HTMLParser()) + + conf = config.getInstance() + # default fetch order list, from the beginning to the end + sources = conf.sources() + + # TODO 准备参数 + # - 清理 ADC_function, webcrawler + proxies: dict = None + config_proxy = conf.proxy() + if config_proxy.enable: + proxies = config_proxy.proxies() + + ca_cert = None + if conf.cacert_file(): + ca_cert = conf.cacert_file() + + json_data = search(file_number, sources, proxies=proxies, verify=ca_cert, + morestoryline=conf.is_storyline(), + specifiedSource=specified_source, specifiedUrl=specified_url, + debug = conf.debug()) + # Return if data not found in all sources + if not json_data: + print('[-]Movie Number not found!') + return None + + # 增加number严格判断,避免提交任何number + if str(json_data.get('number')).upper() != file_number.upper(): + try: + if json_data.get('allow_number_change'): + pass + except: + print('[-]Movie number has changed! 
[{}]->[{}]'.format(file_number, str(json_data.get('number')))) + return None + + # ================================================网站规则添加结束================================================ + + if json_data.get('title') == '': + print('[-]Movie Number or Title not found!') + return None + + title = json_data.get('title') + actor_list = str(json_data.get('actor')).strip("[ ]").replace("'", '').split(',') # 字符串转列表 + actor_list = [actor.strip() for actor in actor_list] # 去除空白 + director = json_data.get('director') + release = json_data.get('release') + number = json_data.get('number') + studio = json_data.get('studio') + source = json_data.get('source') + runtime = json_data.get('runtime') + outline = json_data.get('outline') + label = json_data.get('label') + series = json_data.get('series') + year = json_data.get('year') + + if json_data.get('cover_small'): + cover_small = json_data.get('cover_small') + else: + cover_small = '' + + if json_data.get('trailer'): + trailer = json_data.get('trailer') + else: + trailer = '' + + if json_data.get('extrafanart'): + extrafanart = json_data.get('extrafanart') + else: + extrafanart = '' + + imagecut = json_data.get('imagecut') + tag = str(json_data.get('tag')).strip("[ ]").replace("'", '').replace(" ", '').split(',') # 字符串转列表 @ + while 'XXXX' in tag: + tag.remove('XXXX') + while 'xxx' in tag: + tag.remove('xxx') + actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '') + + # if imagecut == '3': + # DownloadFileWithFilename() + + # ====================处理异常字符====================== #\/:*?"<>| + actor = special_characters_replacement(actor) + actor_list = [special_characters_replacement(a) for a in actor_list] + title = special_characters_replacement(title) + label = special_characters_replacement(label) + outline = special_characters_replacement(outline) + series = special_characters_replacement(series) + studio = special_characters_replacement(studio) + director = special_characters_replacement(director) + tag = [special_characters_replacement(t) for t in tag] + release = release.replace('/', '-') + tmpArr = cover_small.split(',') + if len(tmpArr) > 0: + cover_small = tmpArr[0].strip('\"').strip('\'') + # ====================处理异常字符 END================== #\/:*?"<>| + + # 处理大写 + if conf.number_uppercase(): + json_data['number'] = number.upper() + + # 返回处理后的json_data + json_data['title'] = title + json_data['original_title'] = title + json_data['actor'] = actor + json_data['release'] = release + json_data['cover_small'] = cover_small + json_data['tag'] = tag + json_data['year'] = year + json_data['actor_list'] = actor_list + json_data['trailer'] = trailer + json_data['extrafanart'] = extrafanart + json_data['label'] = label + json_data['outline'] = outline + json_data['series'] = series + json_data['studio'] = studio + json_data['director'] = director + + if conf.is_translate(): + translate_values = conf.translate_values().split(",") + for translate_value in translate_values: + if json_data[translate_value] == "": + continue + if conf.get_translate_engine() == "azure": + t = translate( + json_data[translate_value], + target_language="zh-Hans", + engine=conf.get_translate_engine(), + key=conf.get_translate_key(), + ) + else: + if len(json_data[translate_value]): + if type(json_data[translate_value]) == str: + json_data[translate_value] = special_characters_replacement(json_data[translate_value]) + json_data[translate_value] = translate(json_data[translate_value]) + else: + for i in range(len(json_data[translate_value])): + 
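+ # List-valued fields (e.g. tag) are cleaned item by item here, then joined
+ # with ',' below so the whole list goes through the translator in a single
+ # request and is split back into a list afterwards.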
json_data[translate_value][i] = special_characters_replacement( + json_data[translate_value][i]) + list_in_str = ",".join(json_data[translate_value]) + json_data[translate_value] = translate(list_in_str).split(',') + + if open_cc: + cc_vars = conf.cc_convert_vars().split(",") + ccm = conf.cc_convert_mode() + + def convert_list(mapping_data, language, vars): + total = [] + for i in vars: + if len(mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=f",{i},")) != 0: + i = mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=f",{i},")[0] + total.append(i) + return total + + def convert(mapping_data, language, vars): + if len(mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)) != 0: + return mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)[0] + else: + raise IndexError('keyword not found') + + for cc in cc_vars: + if json_data[cc] == "" or len(json_data[cc]) == 0: + continue + try: + if ccm == 1: + json_data[cc] = convert(info_mapping_data, "zh_cn", json_data[cc]) + json_data[cc] = delete_all_elements_in_str("删除", json_data[cc]) + elif ccm == 2: + json_data[cc] = convert(info_mapping_data, "zh_tw", json_data[cc]) + json_data[cc] = delete_all_elements_in_str("删除", json_data[cc]) + elif ccm == 3: + json_data[cc] = convert(info_mapping_data, "jp", json_data[cc]) + json_data[cc] = delete_all_elements_in_str("删除", json_data[cc]) + except IndexError: + json_data[cc] = open_cc.convert(json_data[cc]) + except: + pass + + naming_rule = "" + original_naming_rule = "" + for i in conf.naming_rule().split("+"): + if i not in json_data: + naming_rule += i.strip("'").strip('"') + original_naming_rule += i.strip("'").strip('"') + else: + item = json_data.get(i) + naming_rule += item if type(item) is not list else "&".join(item) + # PATCH:处理[title]存在翻译的情况,后续NFO文件的original_name只会直接沿用naming_rule,这导致original_name非原始名 + # 理应在翻译处处理 naming_rule和original_naming_rule + if i == 'title': + item = json_data.get('original_title') + original_naming_rule += item if type(item) is not list else "&".join(item) + + json_data['naming_rule'] = naming_rule + json_data['original_naming_rule'] = original_naming_rule + return json_data + + +def special_characters_replacement(text) -> str: + if not isinstance(text, str): + return text + return (text.replace('\\', '∖'). # U+2216 SET MINUS @ Basic Multilingual Plane + replace('/', '∕'). # U+2215 DIVISION SLASH @ Basic Multilingual Plane + replace(':', '꞉'). # U+A789 MODIFIER LETTER COLON @ Latin Extended-D + replace('*', '∗'). # U+2217 ASTERISK OPERATOR @ Basic Multilingual Plane + replace('?', '?'). # U+FF1F FULLWIDTH QUESTION MARK @ Basic Multilingual Plane + replace('"', '"'). # U+FF02 FULLWIDTH QUOTATION MARK @ Basic Multilingual Plane + replace('<', 'ᐸ'). # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane + replace('>', 'ᐳ'). # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane + replace('|', 'ǀ'). # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane + replace('‘', '‘'). # U+02018 LEFT SINGLE QUOTATION MARK + replace('’', '’'). # U+02019 RIGHT SINGLE QUOTATION MARK + replace('…', '…'). + replace('&', '&'). 
+ replace("&", '&') + ) diff --git a/scrapinglib/__init__.py b/scrapinglib/__init__.py new file mode 100644 index 0000000..e2144f5 --- /dev/null +++ b/scrapinglib/__init__.py @@ -0,0 +1,2 @@ +# -*- coding: utf-8 -*- +from .api import search, getSupportedSources \ No newline at end of file diff --git a/scrapinglib/api.py b/scrapinglib/api.py new file mode 100644 index 0000000..0947729 --- /dev/null +++ b/scrapinglib/api.py @@ -0,0 +1,208 @@ +# -*- coding: utf-8 -*- + +import re +import json +from .parser import Parser +import config +import importlib + + +def search(number, sources: str = None, **kwargs): + """ 根据`电影`名搜索信息 + + :param number: number/name depends on type + :param sources: sources string with `,` Eg: `` + :param type: `adult`, `general` + """ + sc = Scraping() + return sc.search(number, sources, **kwargs) + + +def getSupportedSources(): + """ + :param tag: `adult`, `general` + """ + sc = Scraping() + return ','.join(sc.general_full_sources) + + +class Scraping: + """ + """ + adult_full_sources = [] + + general_full_sources = ['tmdb', 'imdb'] + + debug = False + + proxies = None + verify = None + specifiedSource = None + specifiedUrl = None + + dbcookies = None + dbsite = None + # 使用storyline方法进一步获取故事情节 + morestoryline = False + + def search(self, number, sources=None, proxies=None, verify=None, type='adult', + specifiedSource=None, specifiedUrl=None, + dbcookies=None, dbsite=None, morestoryline=False, + debug=False): + self.debug = debug + self.proxies = proxies + self.verify = verify + self.specifiedSource = specifiedSource + self.specifiedUrl = specifiedUrl + self.dbcookies = dbcookies + self.dbsite = dbsite + self.morestoryline = morestoryline + if type == 'adult': + return self.searchAdult(number, sources) + else: + return self.searchGeneral(number, sources) + + def searchGeneral(self, name, sources): + """ 查询电影电视剧 + imdb,tmdb + """ + if self.specifiedSource: + sources = [self.specifiedSource] + else: + sources = self.checkGeneralSources(sources, name) + json_data = {} + for source in sources: + try: + if self.debug: + print('[+]select', source) + try: + module = importlib.import_module('.' + source, 'scrapinglib') + parser_type = getattr(module, source.capitalize()) + parser: Parser = parser_type() + data = parser.scrape(name, self) + if data == 404: + continue + json_data = json.loads(data) + except Exception as e: + if config.getInstance().debug(): + print(e) + # if any service return a valid return, break + if self.get_data_state(json_data): + if self.debug: + print(f"[+]Find movie [{name}] metadata on website '{source}'") + break + except: + continue + + # Return if data not found in all sources + if not json_data or json_data['title'] == "": + return None + + # If actor is anonymous, Fill in Anonymous + if len(json_data['actor']) == 0: + if config.getInstance().anonymous_fill() == True: + if "zh_" in config.getInstance().get_target_language() or "ZH" in config.getInstance().get_target_language(): + json_data['actor'] = "佚名" + else: + json_data['actor'] = "Anonymous" + + return json_data + + def searchAdult(self, number, sources): + if self.specifiedSource: + sources = [self.specifiedSource] + elif type(sources) is list: + pass + else: + sources = self.checkAdultSources(sources, number) + json_data = {} + for source in sources: + try: + if self.debug: + print('[+]select', source) + try: + module = importlib.import_module('.' 
+                    module = importlib.import_module('.' + source, 'scrapinglib')
+                    parser_type = getattr(module, source.capitalize())
+                    parser: Parser = parser_type()
+                    data = parser.scrape(number, self)
+                    if data == 404:
+                        continue
+                    json_data = json.loads(data)
+                except Exception as e:
+                    if config.getInstance().debug():
+                        print(e)
+                # json_data = self.func_mapping[source](number, self)
+                # if any source returns valid data, stop querying the rest
+                if self.get_data_state(json_data):
+                    if self.debug:
+                        print(f"[+]Find movie [{number}] metadata on website '{source}'")
+                    break
+            except:
+                continue
+
+        # Return if data not found in all sources
+        if not json_data or json_data['title'] == "":
+            return None
+
+        # If actor is anonymous, fill in a placeholder name
+        if len(json_data['actor']) == 0:
+            if config.getInstance().anonymous_fill() == True:
+                if "zh_" in config.getInstance().get_target_language() or "ZH" in config.getInstance().get_target_language():
+                    json_data['actor'] = "佚名"
+                else:
+                    json_data['actor'] = "Anonymous"
+
+        return json_data
+
+    def checkGeneralSources(self, c_sources, name):
+        if not c_sources:
+            sources = self.general_full_sources
+        else:
+            sources = c_sources.split(',')
+
+        # check sources in func_mapping
+        todel = []
+        for s in sources:
+            if s not in self.general_full_sources:
+                print('[!] Source Not Exist : ' + s)
+                todel.append(s)
+        for d in todel:
+            print('[!] Remove Source : ' + d)
+            sources.remove(d)
+        return sources
+
+    def checkAdultSources(self, c_sources, file_number):
+        if not c_sources:
+            sources = self.adult_full_sources
+        else:
+            sources = c_sources.split(',')
+
+        def insert(sources, source):
+            if source in sources:
+                sources.insert(0, sources.pop(sources.index(source)))
+            return sources
+
+        # check sources in func_mapping
+        todel = []
+        for s in sources:
+            if s not in self.adult_full_sources and config.getInstance().debug():
+                print('[!] Source Not Exist : ' + s)
+                todel.append(s)
+        for d in todel:
+            if config.getInstance().debug():
+                print('[!] 
Remove Source : ' + d) + sources.remove(d) + return sources + + def get_data_state(self, data: dict) -> bool: # 元数据获取失败检测 + if "title" not in data or "number" not in data: + return False + if data["title"] is None or data["title"] == "" or data["title"] == "null": + return False + if data["number"] is None or data["number"] == "" or data["number"] == "null": + return False + if (data["cover"] is None or data["cover"] == "" or data["cover"] == "null") \ + and (data["cover_small"] is None or data["cover_small"] == "" or + data["cover_small"] == "null"): + return False + return True \ No newline at end of file diff --git a/scrapinglib/httprequest.py b/scrapinglib/httprequest.py new file mode 100644 index 0000000..ca3c36b --- /dev/null +++ b/scrapinglib/httprequest.py @@ -0,0 +1,193 @@ +# -*- coding: utf-8 -*- + +import mechanicalsoup +import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry +from cloudscraper import create_scraper + +import config + +G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.133 Safari/537.36' +G_DEFAULT_TIMEOUT = 10 + + +def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type: str = None, encoding: str = None, + retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None): + """ + 网页请求核心函数 + + 是否使用代理应由上层处理 + """ + errors = "" + headers = {"User-Agent": ua or G_USER_AGENT} + if extra_headers != None: + headers.update(extra_headers) + for i in range(retry): + try: + result = requests.get(url, headers=headers, timeout=timeout, proxies=proxies, + verify=verify, cookies=cookies) + if return_type == "object": + return result + elif return_type == "content": + return result.content + else: + result.encoding = encoding or result.apparent_encoding + return result.text + except Exception as e: + if config.getInstance().debug(): + print(f"[-]Connect: {url} retry {i + 1}/{retry}") + errors = str(e) + if config.getInstance().debug(): + if "getaddrinfo failed" in errors: + print("[-]Connect Failed! Please Check your proxy config") + print("[-]" + errors) + else: + print("[-]" + errors) + print('[-]Connect Failed! Please check your Proxy or Network!') + raise Exception('Connect Failed') + + +def post(url: str, data: dict=None, files=None, cookies=None, ua: str=None, return_type: str=None, encoding: str=None, + retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None): + """ + 是否使用代理应由上层处理 + """ + errors = "" + headers = {"User-Agent": ua or G_USER_AGENT} + + for i in range(retry): + try: + result = requests.post(url, data=data, files=files, headers=headers, timeout=timeout, proxies=proxies, + verify=verify, cookies=cookies) + if return_type == "object": + return result + elif return_type == "content": + return result.content + else: + result.encoding = encoding or result.apparent_encoding + return result + except Exception as e: + if config.getInstance().debug(): + print(f"[-]Connect: {url} retry {i + 1}/{retry}") + errors = str(e) + if config.getInstance().debug(): + if "getaddrinfo failed" in errors: + print("[-]Connect Failed! Please Check your proxy config") + print("[-]" + errors) + else: + print("[-]" + errors) + print('[-]Connect Failed! 
Please check your Proxy or Network!')
+    raise Exception('Connect Failed')
+
+
+class TimeoutHTTPAdapter(HTTPAdapter):
+    def __init__(self, *args, **kwargs):
+        self.timeout = G_DEFAULT_TIMEOUT
+        if "timeout" in kwargs:
+            self.timeout = kwargs["timeout"]
+            del kwargs["timeout"]
+        super().__init__(*args, **kwargs)
+
+    def send(self, request, **kwargs):
+        timeout = kwargs.get("timeout")
+        if timeout is None:
+            kwargs["timeout"] = self.timeout
+        return super().send(request, **kwargs)
+
+
+def request_session(cookies=None, ua: str=None, retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None):
+    """
+    keep-alive
+    """
+    session = requests.Session()
+    retries = Retry(total=retry, connect=retry, backoff_factor=1,
+                    status_forcelist=[429, 500, 502, 503, 504])
+    session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
+    session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
+    if isinstance(cookies, dict) and len(cookies):
+        requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
+    if verify:
+        session.verify = verify
+    if proxies:
+        session.proxies = proxies
+    session.headers = {"User-Agent": ua or G_USER_AGENT}
+    return session
+
+
+# storyline xcity only
+def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None,
+                     return_type: str = None, encoding: str = None,
+                     retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
+    session = requests.Session()
+    if isinstance(cookies, dict) and len(cookies):
+        requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
+    retries = Retry(total=retry, connect=retry, backoff_factor=1,
+                    status_forcelist=[429, 500, 502, 503, 504])
+    session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
+    session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
+    if verify:
+        session.verify = verify
+    if proxies:
+        session.proxies = proxies
+    try:
+        browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=session)
+        result = browser.open(url)
+        if not result.ok:
+            return None
+        form = browser.select_form() if form_select is None else browser.select_form(form_select)
+        if isinstance(fields, dict):
+            for k, v in fields.items():
+                browser[k] = v
+        response = browser.submit_selected()
+
+        if return_type == "object":
+            return response
+        elif return_type == "content":
+            return response.content
+        elif return_type == "browser":
+            return response, browser
+        else:
+            response.encoding = encoding or "utf-8"
+            return response.text
+    except requests.exceptions.ProxyError:
+        print("[-]get_html_by_form() Proxy error! Please check your Proxy")
+    except Exception as e:
+        print(f'[-]get_html_by_form() Failed! 
{e}') + return None + +# storyline javdb only +def get_html_by_scraper(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None, + encoding: str = None, retry: int = 3, proxies=None, timeout: int = G_DEFAULT_TIMEOUT, verify=None): + session = create_scraper(browser={'custom': ua or G_USER_AGENT, }) + if isinstance(cookies, dict) and len(cookies): + requests.utils.add_dict_to_cookiejar(session.cookies, cookies) + retries = Retry(total=retry, connect=retry, backoff_factor=1, + status_forcelist=[429, 500, 502, 503, 504]) + session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout)) + session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout)) + if verify: + session.verify = verify + if proxies: + session.proxies = proxies + try: + if isinstance(url, str) and len(url): + result = session.get(str(url)) + else: # 空url参数直接返回可重用scraper对象,无需设置return_type + return session + if not result.ok: + return None + if return_type == "object": + return result + elif return_type == "content": + return result.content + elif return_type == "scraper": + return result, session + else: + result.encoding = encoding or "utf-8" + return result.text + except requests.exceptions.ProxyError: + print("[-]get_html_by_scraper() Proxy error! Please check your Proxy") + except Exception as e: + print(f"[-]get_html_by_scraper() failed. {e}") + return None diff --git a/scrapinglib/imdb.py b/scrapinglib/imdb.py new file mode 100644 index 0000000..7aab483 --- /dev/null +++ b/scrapinglib/imdb.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- + + +from .parser import Parser + + +class Imdb(Parser): + source = 'imdb' + imagecut = 0 + + expr_title = '//h1[@data-testid="hero-title-block__title"]/text()' + expr_release = '//a[contains(text(),"Release date")]/following-sibling::div[1]/ul/li/a/text()' + expr_cover = '//head/meta[@property="og:image"]/@content' + expr_outline = '//head/meta[@property="og:description"]/@content' + expr_actor = '//h3[contains(text(),"Top cast")]/../../../following-sibling::div[1]/div[2]/div/div/a/text()' + expr_tags = '//div[@data-testid="genres"]/div[2]/a/ul/li/text()' + + def queryNumberUrl(self, number): + """ + TODO 区分 ID 与 名称 + """ + id = number + movieUrl = "https://www.imdb.com/title/" + id + return movieUrl diff --git a/scrapinglib/parser.py b/scrapinglib/parser.py new file mode 100644 index 0000000..d24ea3d --- /dev/null +++ b/scrapinglib/parser.py @@ -0,0 +1,323 @@ +# -*- coding: utf-8 -*- + +import json +import re +from lxml import etree, html + +import config +from . 
import httprequest
+from .utils import getTreeElement, getTreeAll
+
+class Parser:
+    """ Base scraper class
+    """
+    source = 'base'
+    # xpath expr
+    expr_number = ''
+    expr_title = ''
+    expr_studio = ''
+    expr_studio2 = ''
+    expr_runtime = ''
+    expr_runtime2 = ''
+    expr_release = ''
+    expr_outline = ''
+    expr_director = ''
+    expr_actor = ''
+    expr_tags = ''
+    expr_label = ''
+    expr_label2 = ''
+    expr_series = ''
+    expr_series2 = ''
+    expr_cover = ''
+    expr_cover2 = ''
+    expr_smallcover = ''
+    expr_extrafanart = ''
+    expr_trailer = ''
+    expr_actorphoto = ''
+    expr_uncensored = ''
+    expr_userrating = ''
+    expr_uservotes = ''
+
+    def init(self):
+        """ Initialize per-scrape state
+        """
+        # how to derive the poster from the cover:
+        # `0` copy the cover
+        # `1` crop the cover
+        # `3` download the small cover
+        self.imagecut = 1
+        self.uncensored = False
+        self.allow_number_change = False
+        # update
+        self.proxies = None
+        self.verify = None
+        self.extraheader = None
+        self.cookies = None
+        self.morestoryline = False
+        self.specifiedUrl = None
+        self.extraInit()
+
+    def extraInit(self):
+        """ Extra initialization hook for subclasses
+        """
+        pass
+
+    def scrape(self, number, core=None):
+        """ Scrape a movie number
+        """
+        # reset state on every call
+        self.init()
+        self.updateCore(core)
+        result = self.search(number)
+        return result
+
+    def search(self, number):
+        """ Query a number
+
+        Main query flow:
+        1. resolve the url
+        2. fetch the detail page
+        3. parse it
+        4. return the result
+        """
+        self.number = number
+        if self.specifiedUrl:
+            self.detailurl = self.specifiedUrl
+        else:
+            self.detailurl = self.queryNumberUrl(number)
+        if not self.detailurl:
+            return 404
+        htmltree = self.getHtmlTree(self.detailurl)
+        result = self.dictformat(htmltree)
+        return result
+
+    def updateCore(self, core):
+        """ Update parameters from `core`
+
+        For values that must be passed through: cookies, proxy, etc.
+        Subclasses may override.
+        """
+        if not core:
+            return
+        if core.proxies:
+            self.proxies = core.proxies
+        if core.verify:
+            self.verify = core.verify
+        if core.morestoryline:
+            self.morestoryline = True
+        if core.specifiedSource == self.source:
+            self.specifiedUrl = core.specifiedUrl
+
+    def queryNumberUrl(self, number):
+        """ Resolve the detail-page url for a number
+
+        Must be adapted per site, or supplied directly by the caller.
+        A fallback query page may be needed for preview images.
+        """
+        url = "http://detailurl.ai/" + number
+        return url
+
+    def getHtml(self, url, type = None):
+        """ Fetch a page
+        """
+        resp = httprequest.get(url, cookies=self.cookies, proxies=self.proxies, extra_headers=self.extraheader, verify=self.verify, return_type=type)
+        if '404 Page Not Found' in resp \
+            or '<title>未找到页面' in resp \
+            or '404 Not Found' in resp \
+            or '<title>404' in resp \
+            or '<title>お探しの商品が見つかりません' in resp:
+            return 404
+        return resp
+
+    def getHtmlTree(self, url, type = None):
+        """ Fetch a page and return an `etree`
+        """
+        resp = self.getHtml(url, type)
+        if resp == 404:
+            return 404
+        ret = etree.fromstring(resp, etree.HTMLParser())
+        return ret
+
+    def dictformat(self, htmltree):
+        try:
+            dic = {
+                'number': self.getNum(htmltree),
+                'title': self.getTitle(htmltree),
+                'studio': self.getStudio(htmltree),
+                'release': self.getRelease(htmltree),
+                'year': self.getYear(htmltree),
+                'outline': self.getOutline(htmltree),
+                'runtime': self.getRuntime(htmltree),
+                'director': self.getDirector(htmltree),
+                'actor': self.getActors(htmltree),
+                'actor_photo': self.getActorPhoto(htmltree),
+                'cover': self.getCover(htmltree),
+                'cover_small': self.getSmallCover(htmltree),
+                'extrafanart': self.getExtrafanart(htmltree),
+                'trailer': self.getTrailer(htmltree),
+                'tag': self.getTags(htmltree),
+                'label': self.getLabel(htmltree),
+                'series': self.getSeries(htmltree),
+                'userrating': self.getUserRating(htmltree),
+                'uservotes': self.getUserVotes(htmltree),
+                'uncensored': self.getUncensored(htmltree),
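                # [editor's example, sketch, not part of the commit]
                # dictformat() serializes this dict to JSON, and
                # Scraping.get_data_state() accepts it only when 'title',
                # 'number' and one of 'cover'/'cover_small' are non-empty.
                # A hypothetical consumer:
                #
                #     import json
                #     data = json.loads(parser.scrape('tt0111161', None))  # parser: an Imdb()
                #     print(data['title'], data['source'], data['website'])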
+                'website': self.detailurl,
+                'source': self.source,
+                'imagecut': self.getImagecut(htmltree),
+            }
+            dic = self.extradict(dic)
+        except Exception as e:
+            if config.getInstance().debug():
+                print(e)
+            dic = {"title": ""}
+        js = json.dumps(dic, ensure_ascii=False, sort_keys=True, separators=(',', ':'))
+        return js
+
+    def extradict(self, dic: dict):
+        """ Hook for subclasses to make extra changes to the dict
+        """
+        return dic
+
+    def getNum(self, htmltree):
+        """ add strip filtering
+        """
+        return self.getTreeElement(htmltree, self.expr_number)
+
+    def getTitle(self, htmltree):
+        return self.getTreeElement(htmltree, self.expr_title).strip()
+
+    def getRelease(self, htmltree):
+        return self.getTreeElement(htmltree, self.expr_release).strip().replace('/','-')
+
+    def getYear(self, htmltree):
+        """ year is usually parsed from the release date
+        """
+        try:
+            release = self.getRelease(htmltree)
+            return str(re.findall(r'\d{4}', release)).strip(" ['']")
+        except:
+            return ''
+
+    def getRuntime(self, htmltree):
+        return self.getTreeElementbyExprs(htmltree, self.expr_runtime, self.expr_runtime2).strip().rstrip('mi')
+
+    def getOutline(self, htmltree):
+        return self.getTreeElement(htmltree, self.expr_outline).strip()
+
+    def getDirector(self, htmltree):
+        return self.getTreeElement(htmltree, self.expr_director).strip()
+
+    def getActors(self, htmltree) -> list:
+        return self.getTreeAll(htmltree, self.expr_actor)
+
+    def getTags(self, htmltree) -> list:
+        alls = self.getTreeAll(htmltree, self.expr_tags)
+        tags = []
+        for t in alls:
+            for tag in t.strip().split(','):
+                tag = tag.strip()
+                if tag:
+                    tags.append(tag)
+        return tags
+
+    def getStudio(self, htmltree):
+        return self.getTreeElementbyExprs(htmltree, self.expr_studio, self.expr_studio2)
+
+    def getLabel(self, htmltree):
+        return self.getTreeElementbyExprs(htmltree, self.expr_label, self.expr_label2)
+
+    def getSeries(self, htmltree):
+        return self.getTreeElementbyExprs(htmltree, self.expr_series, self.expr_series2)
+
+    def getCover(self, htmltree):
+        return self.getTreeElementbyExprs(htmltree, self.expr_cover, self.expr_cover2)
+
+    def getSmallCover(self, htmltree):
+        return self.getTreeElement(htmltree, self.expr_smallcover)
+
+    def getExtrafanart(self, htmltree) -> list:
+        return self.getTreeAll(htmltree, self.expr_extrafanart)
+
+    def getTrailer(self, htmltree):
+        return self.getTreeElement(htmltree, self.expr_trailer)
+
+    def getActorPhoto(self, htmltree) -> dict:
+        return {}
+
+    def getUncensored(self, htmltree) -> bool:
+        """
+        tag: 無码 無修正 uncensored 无码
+        title: 無碼 無修正 uncensored
+        """
+        if self.uncensored:
+            return self.uncensored
+        tags = [x.lower() for x in self.getTags(htmltree) if len(x)]
+        title = self.getTitle(htmltree)
+        if self.expr_uncensored:
+            u = self.getTreeAll(htmltree, self.expr_uncensored)
+            self.uncensored = bool(u)
+        elif '無码' in tags or '無修正' in tags or 'uncensored' in tags or '无码' in tags:
+            self.uncensored = True
+        elif '無码' in title or '無修正' in title or 'uncensored' in title.lower():
+            self.uncensored = True
+        return self.uncensored
+
+    def getImagecut(self, htmltree):
+        """ fix: the poster should not crop the cover
+        """
+        # if self.imagecut == 1 and self.getUncensored(htmltree):
+        #     self.imagecut = 0
+        return self.imagecut
+
+    def getUserRating(self, htmltree):
+        numstrs = self.getTreeElement(htmltree, self.expr_userrating)
+        nums = re.findall('[0-9.]+', numstrs)
+        if len(nums) == 1:
+            return float(nums[0])
+        return ''
+
+    def getUserVotes(self, htmltree):
+        votestrs = self.getTreeElement(htmltree, self.expr_uservotes)
+        votes = re.findall('[0-9]+', votestrs)
+        if len(votes) == 1:
+            return int(votes[0])
+        return ''
+
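    # [editor's example, hypothetical subclass, not part of the commit]
    # A concrete source only needs xpath expressions plus queryNumberUrl();
    # fetching, the 404 check and dictformat() are inherited from Parser,
    # exactly as imdb.py and tmdb.py do:
    #
    #     class Example(Parser):
    #         source = 'example'
    #         expr_title = '//h1/text()'
    #         expr_cover = '//img[@id="poster"]/@src'
    #
    #         def queryNumberUrl(self, number):
    #             return 'https://example.com/title/' + number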
def getTreeElement(self, tree: html.HtmlElement, expr, index=0): + """ 根据表达式从`xmltree`中获取匹配值,默认 index 为 0 + """ + return getTreeElement(tree, expr, index) + + def getTreeAll(self, tree: html.HtmlElement, expr): + """ 根据表达式从`xmltree`中获取全部匹配值 + """ + return getTreeAll(tree, expr) + + def getTreeElementbyExprs(self, tree: html.HtmlElement, expr, expr2=''): + """ 多个表达式获取element + 使用内部的 getTreeElement 防止继承修改后出现问题 + """ + try: + first = self.getTreeElement(tree, expr).strip() + if first: + return first + second = self.getTreeElement(tree, expr2).strip() + if second: + return second + return '' + except: + return '' + + def getTreeAllbyExprs(self, tree: html.HtmlElement, expr, expr2=''): + """ 多个表达式获取所有element + 合并并剔除重复元素 + """ + try: + result1 = self.getTreeAll(tree, expr) + result2 = self.getTreeAll(tree, expr2) + clean = [ x.strip() for x in result1 if x.strip() and x.strip() != ','] + clean2 = [ x.strip() for x in result2 if x.strip() and x.strip() != ','] + result = list(set(clean + clean2)) + return result + except: + return [] diff --git a/scrapinglib/tmdb.py b/scrapinglib/tmdb.py new file mode 100644 index 0000000..0856b79 --- /dev/null +++ b/scrapinglib/tmdb.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- + + +from .parser import Parser + + +class Tmdb(Parser): + """ + 两种实现,带apikey与不带key + apikey + """ + source = 'tmdb' + imagecut = 0 + apikey = None + + expr_title = '//head/meta[@property="og:title"]/@content' + expr_release = '//div/span[@class="release"]/text()' + expr_cover = '//head/meta[@property="og:image"]/@content' + expr_outline = '//head/meta[@property="og:description"]/@content' + + # def search(self, number): + # self.detailurl = self.queryNumberUrl(number) + # detailpage = self.getHtml(self.detailurl) + + def queryNumberUrl(self, number): + """ + TODO 区分 ID 与 名称 + """ + id = number + movieUrl = "https://www.themoviedb.org/movie/" + id + "?language=zh-CN" + return movieUrl + + def getCover(self, htmltree): + return "https://www.themoviedb.org" + self.getTreeElement(htmltree, self.expr_cover) + diff --git a/wrapper/FreeBSD.sh b/wrapper/FreeBSD.sh new file mode 100755 index 0000000..84c1849 --- /dev/null +++ b/wrapper/FreeBSD.sh @@ -0,0 +1,12 @@ +pkg install python39 py39-requests py39-pip py39-lxml py39-pillow py39-cloudscraper py39-pysocks git zip py39-beautifulsoup448 py39-mechanicalsoup +pip install pyinstaller +pyinstaller --onefile Movie_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \ + --hidden-import "ImageProcessing.cnn" \ + --python-option u \ + --add-data "$(python3.9 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \ + --add-data "$(python3.9 -c 'import opencc as _; print(_.__path__[0])' | tail -n 1):opencc" \ + --add-data "$(python3.9 -c 'import face_recognition_models as _; print(_.__path__[0])' | tail -n 1):face_recognition_models" \ + --add-data "Img:Img" \ + --add-data "config.ini:." 
\ + +cp config.ini ./dist diff --git a/wrapper/Linux.sh b/wrapper/Linux.sh new file mode 100755 index 0000000..abb1a80 --- /dev/null +++ b/wrapper/Linux.sh @@ -0,0 +1,24 @@ +#if [ '$(dpkg --print-architecture)' != 'amd64' ] || [ '$(dpkg --print-architecture)' != 'i386' ]; then +# apt install python3 python3-pip git sudo libxml2-dev libxslt-dev build-essential wget nano libcmocka-dev libcmocka0 -y +# apt install zlib* libjpeg-dev -y + #wget https://files.pythonhosted.org/packages/82/96/21ba3619647bac2b34b4996b2dbbea8e74a703767ce24192899d9153c058/pyinstaller-4.0.tar.gz + #tar -zxvf pyinstaller-4.0.tar.gz + #cd pyinstaller-4.0/bootloader + #sed -i "s/ '-Werror',//" wscript + #python3 ./waf distclean all + #cd ../ + #python3 setup.py install + #cd ../ +#fi +pip3 install -r requirements.txt +pip3 install cloudscraper==1.2.52 +pyinstaller --onefile Movie_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \ + --hidden-import "ImageProcessing.cnn" \ + --python-option u \ + --add-data "$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \ + --add-data "$(python3 -c 'import opencc as _; print(_.__path__[0])' | tail -n 1):opencc" \ + --add-data "$(python3 -c 'import face_recognition_models as _; print(_.__path__[0])' | tail -n 1):face_recognition_models" \ + --add-data "Img:Img" \ + --add-data "config.ini:." \ + +cp config.ini ./dist diff --git a/xlog.py b/xlog.py new file mode 100755 index 0000000..2d58061 --- /dev/null +++ b/xlog.py @@ -0,0 +1,329 @@ +import os +import sys +import time +from datetime import datetime +import traceback +import threading +import json +import shutil + +CRITICAL = 50 +FATAL = CRITICAL +ERROR = 40 +WARNING = 30 +WARN = WARNING +INFO = 20 +DEBUG = 10 +NOTSET = 0 + + +class Logger: + def __init__(self, name, buffer_size=0, file_name=None, roll_num=1): + self.err_color = '\033[0m' + self.warn_color = '\033[0m' + self.debug_color = '\033[0m' + self.reset_color = '\033[0m' + self.set_console_color = lambda color: sys.stderr.write(color) + self.name = str(name) + self.file_max_size = 1024 * 1024 + self.buffer_lock = threading.Lock() + self.buffer = {} # id => line + self.buffer_size = buffer_size + self.last_no = 0 + self.min_level = NOTSET + self.log_fd = None + self.roll_num = roll_num + if file_name: + self.set_file(file_name) + + def set_buffer(self, buffer_size): + with self.buffer_lock: + self.buffer_size = buffer_size + buffer_len = len(self.buffer) + if buffer_len > self.buffer_size: + for i in range(self.last_no - buffer_len, self.last_no - self.buffer_size): + try: + del self.buffer[i] + except: + pass + + def setLevel(self, level): + if level == "DEBUG": + self.min_level = DEBUG + elif level == "INFO": + self.min_level = INFO + elif level == "WARN": + self.min_level = WARN + elif level == "ERROR": + self.min_level = ERROR + elif level == "FATAL": + self.min_level = FATAL + else: + print(("log level not support:%s", level)) + + def set_color(self): + self.err_color = None + self.warn_color = None + self.debug_color = None + self.reset_color = None + self.set_console_color = lambda x: None + if hasattr(sys.stderr, 'isatty') and sys.stderr.isatty(): + if os.name == 'nt': + self.err_color = 0x04 + self.warn_color = 0x06 + self.debug_color = 0x002 + self.reset_color = 0x07 + + import ctypes + SetConsoleTextAttribute = ctypes.windll.kernel32.SetConsoleTextAttribute + GetStdHandle = ctypes.windll.kernel32.GetStdHandle + self.set_console_color = lambda color: SetConsoleTextAttribute(GetStdHandle(-11), color) + + elif 
os.name == 'posix':
+            self.err_color = '\033[31m'
+            self.warn_color = '\033[33m'
+            self.debug_color = '\033[32m'
+            self.reset_color = '\033[0m'
+
+            self.set_console_color = lambda color: sys.stderr.write(color)
+
+    def set_file(self, file_name):
+        self.log_filename = file_name
+        if os.path.isfile(file_name):
+            self.file_size = os.path.getsize(file_name)
+            if self.file_size > self.file_max_size:
+                self.roll_log()
+                self.file_size = 0
+        else:
+            self.file_size = 0
+
+        self.log_fd = open(file_name, "a+")
+
+    def roll_log(self):
+        for i in range(self.roll_num, 1, -1):
+            new_name = "%s.%d" % (self.log_filename, i)
+            old_name = "%s.%d" % (self.log_filename, i - 1)
+            if not os.path.isfile(old_name):
+                continue
+
+            # self.info("roll_log %s -> %s", old_name, new_name)
+            shutil.move(old_name, new_name)
+
+        shutil.move(self.log_filename, self.log_filename + ".1")
+
+    def log_console(self, level, console_color, fmt, *args, **kwargs):
+        try:
+            console_string = '[%s] %s\n' % (level, fmt % args)
+            self.set_console_color(console_color)
+            sys.stderr.write(console_string)
+            self.set_console_color(self.reset_color)
+        except:
+            pass
+
+    def log_to_file(self, level, console_color, fmt, *args, **kwargs):
+        if self.log_fd:
+            if level == 'e':
+                string = '%s' % (fmt % args)
+            else:
+                time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:23]
+                string = '%s [%s] [%s] %s\n' % (time_str, self.name, level, fmt % args)
+
+            self.log_fd.write(string)
+            try:
+                self.log_fd.flush()
+            except:
+                pass
+
+            self.file_size += len(string)
+            if self.file_size > self.file_max_size:
+                self.log_fd.close()
+                self.log_fd = None
+                self.roll_log()
+                self.log_fd = open(self.log_filename, "w")
+                self.file_size = 0
+
+    def log(self, level, console_color, html_color, fmt, *args, **kwargs):
+        self.buffer_lock.acquire()
+        try:
+            self.log_console(level, console_color, fmt, *args, **kwargs)
+
+            self.log_to_file(level, console_color, fmt, *args, **kwargs)
+
+            if self.buffer_size:
+                self.last_no += 1
+                # rendered copy of the message kept in the ring buffer
+                string = '[%s] %s\n' % (level, fmt % args)
+                self.buffer[self.last_no] = string
+                buffer_len = len(self.buffer)
+                if buffer_len > self.buffer_size:
+                    del self.buffer[self.last_no - self.buffer_size]
+        except Exception as e:
+            string = '%s - [%s]LOG_EXCEPT: %s, Except:%s<br> %s' % (
+                time.ctime()[4:-5], level, fmt % args, e, traceback.format_exc())
+            self.last_no += 1
+            self.buffer[self.last_no] = string
+            buffer_len = len(self.buffer)
+            if buffer_len > self.buffer_size:
+                del self.buffer[self.last_no - self.buffer_size]
+        finally:
+            self.buffer_lock.release()
+
+    def debug(self, fmt, *args, **kwargs):
+        if self.min_level > DEBUG:
+            return
+        self.log('-', self.debug_color, '21610b', fmt, *args, **kwargs)
+
+    def info(self, fmt, *args, **kwargs):
+        if self.min_level > INFO:
+            return
+        self.log('+', self.reset_color, '000000', fmt, *args, **kwargs)
+
+    def warning(self, fmt, *args, **kwargs):
+        if self.min_level > WARN:
+            return
+        self.log('#', self.warn_color, 'FF8000', fmt, *args, **kwargs)
+
+    def warn(self, fmt, *args, **kwargs):
+        self.warning(fmt, *args, **kwargs)
+
+    def error(self, fmt, *args, **kwargs):
+        if self.min_level > ERROR:
+            return
+        self.log('!', self.err_color, 'FE2E2E', fmt, *args, **kwargs)
+
+    def exception(self, fmt, *args, **kwargs):
+        self.error(fmt, *args, **kwargs)
+        string = '%s' % (traceback.format_exc())
+        self.log_to_file('e', self.err_color, string)
+
+    def critical(self, fmt, *args, **kwargs):
+        if self.min_level > CRITICAL:
+            return
+        self.log('!', self.err_color, 'D7DF01', fmt, *args, **kwargs)
+
+    def tofile(self, fmt, *args, **kwargs):
+        self.log_to_file('@', 
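        # [editor's example, usage sketch assuming the defaults above, not part
        # of the commit] Messages are printf-style, colors apply on POSIX ttys,
        # and the file rolls once file_max_size is exceeded:
        #
        #     log = Logger('mdc', file_name='mdc.log', roll_num=3)
        #     log.setLevel('INFO')
        #     log.info('scraped %s from %s', 'tt0111161', 'imdb')
        #     log.error('fetch failed: %s', 'timeout')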
self.warn_color, fmt, *args, **kwargs) + + # ================================================================= + def set_buffer_size(self, set_size): + self.buffer_lock.acquire() + self.buffer_size = set_size + buffer_len = len(self.buffer) + if buffer_len > self.buffer_size: + for i in range(self.last_no - buffer_len, self.last_no - self.buffer_size): + try: + del self.buffer[i] + except: + pass + self.buffer_lock.release() + + def get_last_lines(self, max_lines): + self.buffer_lock.acquire() + buffer_len = len(self.buffer) + if buffer_len > max_lines: + first_no = self.last_no - max_lines + else: + first_no = self.last_no - buffer_len + 1 + + jd = {} + if buffer_len > 0: + for i in range(first_no, self.last_no + 1): + jd[i] = self.unicode_line(self.buffer[i]) + self.buffer_lock.release() + return json.dumps(jd) + + def get_new_lines(self, from_no): + self.buffer_lock.acquire() + jd = {} + first_no = self.last_no - len(self.buffer) + 1 + if from_no < first_no: + from_no = first_no + + if self.last_no >= from_no: + for i in range(from_no, self.last_no + 1): + jd[i] = self.unicode_line(self.buffer[i]) + self.buffer_lock.release() + return json.dumps(jd) + + def unicode_line(self, line): + try: + if type(line) is str: + return line + else: + return str(line, errors='ignore') + except Exception as e: + print(("unicode err:%r" % e)) + print(("line can't decode:%s" % line)) + print(("Except stack:%s" % traceback.format_exc())) + return "" + + +loggerDict = {} + + +def getLogger(name=None, buffer_size=0, file_name=None, roll_num=1): + global loggerDict, default_log + if name is None: + for n in loggerDict: + name = n + break + if name is None: + name = u"default" + + if not isinstance(name, str): + raise TypeError('A logger name must be string or Unicode') + if isinstance(name, bytes): + name = name.encode('utf-8') + + if name in loggerDict: + return loggerDict[name] + else: + logger_instance = Logger(name, buffer_size, file_name, roll_num) + loggerDict[name] = logger_instance + default_log = logger_instance + return logger_instance + + +default_log = getLogger() + + +def debg(fmt, *args, **kwargs): + default_log.debug(fmt, *args, **kwargs) + + +def info(fmt, *args, **kwargs): + default_log.info(fmt, *args, **kwargs) + + +def warn(fmt, *args, **kwargs): + default_log.warning(fmt, *args, **kwargs) + + +def erro(fmt, *args, **kwargs): + default_log.error(fmt, *args, **kwargs) + + +def excp(fmt, *args, **kwargs): + default_log.exception(fmt, *args, **kwargs) + + +def crit(fmt, *args, **kwargs): + default_log.critical(fmt, *args, **kwargs) + + +def tofile(fmt, *args, **kwargs): + default_log.tofile(fmt, *args, **kwargs) + + +if __name__ == '__main__': + log_file = os.path.join(os.path.dirname(sys.argv[0]), "test.log") + getLogger().set_file(log_file) + debg("debug") + info("info") + warn("warning") + erro("error") + crit("critical") + tofile("write to file only") + + try: + 1 / 0 + except Exception as e: + excp("An error has occurred")
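
# [editor's example, demo sketch, not part of the commit] roll_log() keeps up
# to roll_num numbered generations (demo.log -> demo.log.1 -> demo.log.2; the
# oldest is overwritten). Shrinking file_max_size forces rolls quickly:

def _roll_demo():
    log = getLogger('roll-demo', file_name='demo.log', roll_num=2)
    log.file_max_size = 256  # tiny cap so a few writes trigger roll_log()
    for i in range(100):
        log.info('filler line %d', i)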