From 0dad9aee7bac49ad5c34d9a49511da25d00a6d4c Mon Sep 17 00:00:00 2001 From: Sebastian Golasch Date: Mon, 27 Feb 2017 13:54:06 +0100 Subject: [PATCH] fix(userdata): Dropped sloppy JSOn parser in favour of a true JavaScript AST analyzer --- resources/lib/NetflixSession.py | 136 +- resources/lib/pyjsparser/__init__.py | 4 + resources/lib/pyjsparser/parser.py | 2902 ++++++++++++++++++++ resources/lib/pyjsparser/pyjsparserdata.py | 303 ++ resources/lib/pyjsparser/std_nodes.py | 471 ++++ resources/lib/utils.py | 17 - 6 files changed, 3769 insertions(+), 64 deletions(-) create mode 100644 resources/lib/pyjsparser/__init__.py create mode 100644 resources/lib/pyjsparser/parser.py create mode 100644 resources/lib/pyjsparser/pyjsparserdata.py create mode 100644 resources/lib/pyjsparser/std_nodes.py diff --git a/resources/lib/NetflixSession.py b/resources/lib/NetflixSession.py index d116911..46f2f2d 100644 --- a/resources/lib/NetflixSession.py +++ b/resources/lib/NetflixSession.py @@ -15,7 +15,7 @@ try: except: import pickle from bs4 import BeautifulSoup -from utils import strip_tags +from pyjsparser import PyJsParser from utils import noop class NetflixSession: @@ -145,53 +145,57 @@ class NetflixSession: def extract_inline_netflix_page_data (self, page_soup): """Extracts all - - So we´re extracting every JavaScript object contained in the `netflix.x = {};` variable, - strip all html tags, unescape the whole thing & finally parse the resulting serialized JSON from this - operations. Errors are expected, as not all + We use a JS parser to generate an AST of the code given & then parse that AST into a python dict. + This should be okay, as we´re only interested in a few static values & put the rest aside Parameters ---------- page_soup : :obj:`BeautifulSoup` Instance of an BeautifulSoup document or node containing the complete page contents - Returns ------- :obj:`list` of :obj:`dict` List of all the serialized data pulled out of the pagws ', '').strip() - # unescape the contents as they contain characters a JSON parser chokes up upon - unescaped_data = stripped_data.decode('string_escape') - # strip all the HTML tags within the strings a JSON parser chokes up upon them - transformed_data = strip_tags(unescaped_data) - # parse the contents with a regular JSON parser, as they should be in a shape that ot actually works - try: - parsed_data = json.loads(transformed_data) - inline_data.append(parsed_data) - except ValueError, e: - noop() - except TypeError, e: - noop() - - return inline_data; + data = {}; + # unicode escape that incoming script stuff + contents = self._to_unicode(str(script.contents[0])) + # parse the JS & load the declarations we´re interested in + declarations = parser.parse(contents)['body'][1]['expression']['right']['properties']; + for declaration in declarations: + for key in declaration: + # we found the correct path if the declaration is a dict & of type 'ObjectExpression' + if type(declaration[key]) is dict: + if declaration[key]['type'] == 'ObjectExpression': + # add all static data recursivly + for expression in declaration[key]['properties']: + data[expression['key']['value']] = self._parse_rec(expression['value']) + inline_data.append(data) + return inline_data + + def _parse_rec (self, node): + """Iterates over a JavaScript AST and retu values found + Parameters + ---------- + value : :obj:`dict` + JS AST Expression + Returns + ------- + :obj:`dict` of :obj:`dict` or :obj:`str` + Parsed contents of the node + """ + if node['type'] == 'ObjectExpression': + _ret = {} + for prop in node['properties']: + _ret.update({prop['key']['value']: self._parse_rec(prop['value'])}) + return _ret + if node['type'] == 'Literal': + return node['value'] def _parse_user_data (self, netflix_page_data): """Parse out the user data from the big chunk of dicts we got from @@ -236,9 +240,12 @@ class NetflixSession: 'pinEnabled' ] for item in netflix_page_data: - if 'models' in dict(item).keys(): + if 'memberContext' in dict(item).keys(): for important_field in important_fields: - user_data.update({important_field: item['models']['userInfo']['data'][important_field]}) + user_data.update({important_field: item['memberContext']['data']['userInfo'][important_field]}) + print '.............' + print user_data + print '.............' return user_data def _parse_profile_data (self, netflix_page_data): @@ -275,17 +282,19 @@ class NetflixSession: ] # TODO: get rid of this christmas tree of doom for item in netflix_page_data: - if 'profiles' in dict(item).keys(): - for profile_id in item['profiles']: - if self._is_size_key(key=profile_id) == False: + if 'hasViewedRatingWelcomeModal' in dict(item).keys(): + for profile_id in item: + print '------------' + print profile_id + print '------------' + if self._is_size_key(key=profile_id) == False and type(item[profile_id]) == dict and item[profile_id].get('avatar', False) != False: profile = {'id': profile_id} for important_field in important_fields: - profile.update({important_field: item['profiles'][profile_id]['summary'][important_field]}) - avatar_base = item['avatars']['nf'].get(item['profiles'][profile_id]['summary']['avatarName'], False); + profile.update({important_field: item[profile_id]['summary'][important_field]}) + avatar_base = item['nf'].get(item[profile_id]['summary']['avatarName'], False); avatar = 'https://secure.netflix.com/ffe/profiles/avatars_v2/320x320/PICON_029.png' if avatar_base == False else avatar_base['images']['byWidth']['320']['value'] profile.update({'avatar': avatar}) profiles.update({profile_id: profile}) - return profiles def _parse_api_base_data (self, netflix_page_data): @@ -317,9 +326,9 @@ class NetflixSession: 'ICHNAEA_ROOT' ] for item in netflix_page_data: - if 'models' in dict(item).keys(): + if 'serverDefs' in dict(item).keys(): for important_field in important_fields: - api_data.update({important_field: item['models']['serverDefs']['data'][important_field]}) + api_data.update({important_field: item['serverDefs']['data'][important_field]}) return api_data def _parse_esn_data (self, netflix_page_data): @@ -339,8 +348,8 @@ class NetflixSession: """ esn = ''; for item in netflix_page_data: - if 'models' in dict(item).keys(): - esn = item['models']['esnGeneratorModel']['data']['esn'] + if 'esnGeneratorModel' in dict(item).keys(): + esn = item['esnGeneratorModel']['data']['esn'] return esn def _parse_page_contents (self, page_soup): @@ -357,6 +366,11 @@ class NetflixSession: self.esn = self._parse_esn_data(netflix_page_data=netflix_page_data) self.api_data = self._parse_api_base_data(netflix_page_data=netflix_page_data) self.profiles = self._parse_profile_data(netflix_page_data=netflix_page_data) + if self.user_data.get('bauthURL', False) == False: + print '...............' + print page_soup.text.find('authURL'); + print '...............' + def is_logged_in (self, account): """Determines if a user is already logged in (with a valid cookie), @@ -1850,6 +1864,34 @@ class NetflixSession: # return the parsed response & everything´s fine return response.json() + def _to_unicode(self, str): + '''Attempt to fix non uft-8 string into utf-8, using a limited set of encodings + + Parameters + ---------- + str : `str` + String to decode + + Returns + ------- + `str` + Decoded string + ''' + # fuller list of encodings at http://docs.python.org/library/codecs.html#standard-encodings + if not str: return u'' + u = None + # we could add more encodings here, as warranted. + encodings = ('ascii', 'utf8', 'latin1') + for enc in encodings: + if u: break + try: + u = unicode(str,enc) + except UnicodeDecodeError: + pass + if not u: + u = unicode(str, errors='replace') + return u + def _update_my_list (self, video_id, operation): """Tiny helper to add & remove items from "my list" diff --git a/resources/lib/pyjsparser/__init__.py b/resources/lib/pyjsparser/__init__.py new file mode 100644 index 0000000..07eae8f --- /dev/null +++ b/resources/lib/pyjsparser/__init__.py @@ -0,0 +1,4 @@ +__all__ = ['PyJsParser', 'parse', 'JsSyntaxError'] +__author__ = 'Piotr Dabkowski' +__version__ = '2.2.0' +from .parser import PyJsParser, parse, JsSyntaxError diff --git a/resources/lib/pyjsparser/parser.py b/resources/lib/pyjsparser/parser.py new file mode 100644 index 0000000..b483a97 --- /dev/null +++ b/resources/lib/pyjsparser/parser.py @@ -0,0 +1,2902 @@ +# The MIT License +# +# Copyright 2014, 2015 Piotr Dabkowski +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the 'Software'), +# to deal in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, subject +# to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or +# substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT +# LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE +# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE +from __future__ import unicode_literals +from .pyjsparserdata import * +from .std_nodes import * +from pprint import pprint +import sys + +__all__ = ['PyJsParser', 'parse', 'ENABLE_JS2PY_ERRORS', 'ENABLE_PYIMPORT', 'JsSyntaxError'] +REGEXP_SPECIAL_SINGLE = ('\\', '^', '$', '*', '+', '?', '.', '[', ']', '(', ')', '{', '{', '|', '-') +ENABLE_PYIMPORT = False +ENABLE_JS2PY_ERRORS = False + +PY3 = sys.version_info >= (3,0) + +if PY3: + basestring = str + long = int + xrange = range + unicode = str + +ESPRIMA_VERSION = '2.2.0' +DEBUG = False +# Small naming convention changes +# len -> leng +# id -> d +# type -> typ +# str -> st +true = True +false = False +null = None + + +class PyJsParser: + """ Usage: + parser = PyJsParser() + parser.parse('var JavaScriptCode = 5.1') + """ + + def __init__(self): + self.clean() + + def test(self, code): + pprint(self.parse(code)) + + def clean(self): + self.strict = None + self.sourceType = None + self.index = 0 + self.lineNumber = 1 + self.lineStart = 0 + self.hasLineTerminator = None + self.lastIndex = None + self.lastLineNumber = None + self.lastLineStart = None + self.startIndex = None + self.startLineNumber = None + self.startLineStart = None + self.scanning = None + self.lookahead = None + self.state = None + self.extra = None + self.isBindingElement = None + self.isAssignmentTarget = None + self.firstCoverInitializedNameError = None + + # 7.4 Comments + + def skipSingleLineComment(self, offset): + start = self.index - offset; + while self.index < self.length: + ch = self.source[self.index]; + self.index += 1 + if isLineTerminator(ch): + if (ord(ch) == 13 and ord(self.source[self.index]) == 10): + self.index += 1 + self.lineNumber += 1 + self.hasLineTerminator = True + self.lineStart = self.index + return + + def skipMultiLineComment(self): + while self.index < self.length: + ch = ord(self.source[self.index]) + if isLineTerminator(ch): + if (ch == 0x0D and ord(self.source[self.index + 1]) == 0x0A): + self.index += 1 + self.lineNumber += 1 + self.index += 1 + self.hasLineTerminator = True + self.lineStart = self.index + elif ch == 0x2A: + # Block comment ends with '*/'. + if ord(self.source[self.index + 1]) == 0x2F: + self.index += 2 + return + self.index += 1 + else: + self.index += 1 + self.tolerateUnexpectedToken() + + def skipComment(self): + self.hasLineTerminator = False + start = (self.index == 0) + while self.index < self.length: + ch = ord(self.source[self.index]) + if isWhiteSpace(ch): + self.index += 1 + elif isLineTerminator(ch): + self.hasLineTerminator = True + self.index += 1 + if (ch == 0x0D and ord(self.source[self.index]) == 0x0A): + self.index += 1 + self.lineNumber += 1 + self.lineStart = self.index + start = True + elif (ch == 0x2F): # U+002F is '/' + ch = ord(self.source[self.index + 1]) + if (ch == 0x2F): + self.index += 2 + self.skipSingleLineComment(2) + start = True + elif (ch == 0x2A): # U+002A is '*' + self.index += 2 + self.skipMultiLineComment() + else: + break + elif (start and ch == 0x2D): # U+002D is '-' + # U+003E is '>' + if (ord(self.source[self.index + 1]) == 0x2D) and (ord(self.source[self.index + 2]) == 0x3E): + # '-->' is a single-line comment + self.index += 3 + self.skipSingleLineComment(3) + else: + break + elif (ch == 0x3C): # U+003C is '<' + if self.source[self.index + 1: self.index + 4] == '!--': + #