X-Git-Url: http://git.code-monkey.de/?a=blobdiff_plain;f=resources%2Flib%2FNetflixSession.py;h=60625a3cc1d5eae729fed63b06cf2bb9b5bc2521;hb=08aa1fe9088e94596cbd40055bf63457a7617db8;hp=d1169117c2536c3e2e53623ff6ee731a87fc7395;hpb=1d8369ef0de8ea43ac783282d50f4844f486d01d;p=plugin.video.netflix.git

diff --git a/resources/lib/NetflixSession.py b/resources/lib/NetflixSession.py
index d116911..60625a3 100644
--- a/resources/lib/NetflixSession.py
+++ b/resources/lib/NetflixSession.py
@@ -14,8 +14,8 @@ try:
     import cPickle as pickle
 except:
     import pickle
-from bs4 import BeautifulSoup
-from utils import strip_tags
+from bs4 import BeautifulSoup, SoupStrainer
+from pyjsparser import PyJsParser
 from utils import noop
 
 class NetflixSession:
@@ -145,53 +145,57 @@ class NetflixSession:
     def extract_inline_netflix_page_data (self, page_soup):
         """Extracts all <script/> tags from the given page
-
-        So we´re extracting every JavaScript object contained in the `netflix.x = {};` variable,
-        strip all html tags, unescape the whole thing & finally parse the resulting serialized JSON from this
-        operations. Errors are expected, as not all <script/> tags contain parsable data
+        We use a JS parser to generate an AST of the code given & then parse that AST into a python dict.
+        This should be okay, as we´re only interested in a few static values & put the rest aside
 
         Parameters
         ----------
         page_soup : :obj:`BeautifulSoup`
             Instance of a BeautifulSoup document or node containing the complete page contents
-
         Returns
         -------
         :obj:`list` of :obj:`dict`
             List of all the serialized data pulled out of the pages <script/> tags
         """
         inline_data = [];
+        parser = PyJsParser()
         data_scripts = page_soup.find_all('script', attrs={'src': False})
         for script in data_scripts:
-            stripped_data = str(script.contents[0]).replace('<script>', '').replace('</script>', '').strip()
-            # unescape the contents as they contain characters a JSON parser chokes up upon
-            unescaped_data = stripped_data.decode('string_escape')
-            # strip all the HTML tags within the strings a JSON parser chokes up upon them
-            transformed_data = strip_tags(unescaped_data)
-            # parse the contents with a regular JSON parser, as they should be in a shape that it actually works
-            try:
-                parsed_data = json.loads(transformed_data)
-                inline_data.append(parsed_data)
-            except ValueError, e:
-                noop()
-            except TypeError, e:
-                noop()
-
-        return inline_data;
+            data = {};
+            # unicode escape that incoming script stuff
+            contents = self._to_unicode(str(script.contents[0]))
+            # parse the JS & load the declarations we´re interested in
+            declarations = parser.parse(contents)['body'][1]['expression']['right']['properties'];
+            for declaration in declarations:
+                for key in declaration:
+                    # we found the correct path if the declaration is a dict & of type 'ObjectExpression'
+                    if type(declaration[key]) is dict:
+                        if declaration[key]['type'] == 'ObjectExpression':
+                            # add all static data recursively
+                            for expression in declaration[key]['properties']:
+                                data[expression['key']['value']] = self._parse_rec(expression['value'])
+            inline_data.append(data)
+        return inline_data
+
+    def _parse_rec (self, node):
+        """Iterates over a JavaScript AST and returns the values found
+        Parameters
+        ----------
+        node : :obj:`dict`
+            JS AST Expression
+        Returns
+        -------
+        :obj:`dict` of :obj:`dict` or :obj:`str`
+            Parsed contents of the node
+        """
+        if node['type'] == 'ObjectExpression':
+            _ret = {}
+            for prop in node['properties']:
+                _ret.update({prop['key']['value']: self._parse_rec(prop['value'])})
+            return _ret
+        if node['type'] == 'Literal':
+            return node['value']
 
     def _parse_user_data (self, netflix_page_data):
         """Parse out the user data from the big chunk of dicts we got from
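
For context: the new extraction strategy hands the inline JavaScript to pyjsparser and walks the resulting AST, keeping only object and scalar literals. Below is a minimal, runnable sketch of that walk; the `netflix.reactContext` payload is a made-up stand-in for the real inline data, and the plugin reads `body[1]` rather than `body[0]` presumably because the real script carries an extra leading statement:

    from pyjsparser import PyJsParser

    js = 'netflix.reactContext = {"models": {"userInfo": {"data": {"name": "Jane"}}}};'

    def parse_rec(node):
        # reduce an ObjectExpression node to a plain dict, keeping literal values
        if node['type'] == 'ObjectExpression':
            return dict((prop['key']['value'], parse_rec(prop['value']))
                        for prop in node['properties'])
        if node['type'] == 'Literal':
            return node['value']

    ast = PyJsParser().parse(js)
    # a single statement here, so the assignment lives in body[0]
    print(parse_rec(ast['body'][0]['expression']['right']))
    # {'models': {'userInfo': {'data': {'name': 'Jane'}}}}
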
@@ -236,9 +240,9 @@ class NetflixSession:
             'pinEnabled'
         ]
         for item in netflix_page_data:
-            if 'models' in dict(item).keys():
+            if 'memberContext' in dict(item).keys():
                 for important_field in important_fields:
-                    user_data.update({important_field: item['models']['userInfo']['data'][important_field]})
+                    user_data.update({important_field: item['memberContext']['data']['userInfo'][important_field]})
         return user_data
 
     def _parse_profile_data (self, netflix_page_data):
@@ -275,17 +279,16 @@
         ]
         # TODO: get rid of this christmas tree of doom
         for item in netflix_page_data:
-            if 'profiles' in dict(item).keys():
-                for profile_id in item['profiles']:
-                    if self._is_size_key(key=profile_id) == False:
+            if 'hasViewedRatingWelcomeModal' in dict(item).keys():
+                for profile_id in item:
+                    if self._is_size_key(key=profile_id) == False and type(item[profile_id]) == dict and item[profile_id].get('avatar', False) != False:
                         profile = {'id': profile_id}
                         for important_field in important_fields:
-                            profile.update({important_field: item['profiles'][profile_id]['summary'][important_field]})
-                            avatar_base = item['avatars']['nf'].get(item['profiles'][profile_id]['summary']['avatarName'], False);
+                            profile.update({important_field: item[profile_id]['summary'][important_field]})
+                        avatar_base = item['nf'].get(item[profile_id]['summary']['avatarName'], False);
                         avatar = 'https://secure.netflix.com/ffe/profiles/avatars_v2/320x320/PICON_029.png' if avatar_base == False else avatar_base['images']['byWidth']['320']['value']
                         profile.update({'avatar': avatar})
                         profiles.update({profile_id: profile})
-
         return profiles
 
     def _parse_api_base_data (self, netflix_page_data):
@@ -317,9 +320,9 @@
             'ICHNAEA_ROOT'
         ]
         for item in netflix_page_data:
-            if 'models' in dict(item).keys():
+            if 'serverDefs' in dict(item).keys():
                 for important_field in important_fields:
-                    api_data.update({important_field: item['models']['serverDefs']['data'][important_field]})
+                    api_data.update({important_field: item['serverDefs']['data'][important_field]})
         return api_data
 
     def _parse_esn_data (self, netflix_page_data):
@@ -339,8 +342,8 @@
         """
         esn = '';
         for item in netflix_page_data:
-            if 'models' in dict(item).keys():
-                esn = item['models']['esnGeneratorModel']['data']['esn']
+            if 'esnGeneratorModel' in dict(item).keys():
+                esn = item['esnGeneratorModel']['data']['esn']
         return esn
 
     def _parse_page_contents (self, page_soup):
@@ -357,6 +360,7 @@
         self.esn = self._parse_esn_data(netflix_page_data=netflix_page_data)
         self.api_data = self._parse_api_base_data(netflix_page_data=netflix_page_data)
         self.profiles = self._parse_profile_data(netflix_page_data=netflix_page_data)
+        return netflix_page_data
 
     def is_logged_in (self, account):
         """Determines if a user is already logged in (with a valid cookie),
@@ -383,9 +387,9 @@
         response = self.session.get(self._get_document_url_for(component='profiles'), verify=self.verify_ssl)
 
         # parse out the needed inline information
-        page_soup = BeautifulSoup(response.text)
-        page_data = self.extract_inline_netflix_page_data(page_soup=page_soup)
-        self._parse_page_contents(page_soup=page_soup)
+        only_script_tags = SoupStrainer('script')
+        page_soup = BeautifulSoup(response.text, 'html.parser', parse_only=only_script_tags)
+        page_data = self._parse_page_contents(page_soup=page_soup)
 
         # check if the cookie is still valid
         for item in page_data:
@@ -429,7 +433,7 @@
             return False;
 
         # collect all the login fields & their contents and add the user credentials
-        page_soup = BeautifulSoup(response.text)
+        page_soup = BeautifulSoup(response.text, 'html.parser')
         login_form = page_soup.find(attrs={'class' : 'ui-label-text'}).findPrevious('form')
         login_payload = self.parse_login_form_fields(form_soup=login_form)
         if 'email' in login_payload:
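
The SoupStrainer change above is a performance fix as much as a parser pin: with `parse_only`, BeautifulSoup materializes only the <script> tags that the inline-data extraction needs instead of building a tree for the whole page. A small sketch of the pattern (the HTML snippet is invented):

    from bs4 import BeautifulSoup, SoupStrainer

    html = '<html><body><div>lots of markup</div><script>netflix.foo = {};</script></body></html>'

    # parse_only tells the parser to skip everything that is not a <script> tag
    only_script_tags = SoupStrainer('script')
    soup = BeautifulSoup(html, 'html.parser', parse_only=only_script_tags)
    print(soup.find_all('script'))
    # [<script>netflix.foo = {};</script>]
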
@@ -440,7 +444,7 @@
 
         # perform the login
         login_response = self.session.post(self._get_document_url_for(component='login'), data=login_payload, verify=self.verify_ssl)
-        login_soup = BeautifulSoup(login_response.text)
+        login_soup = BeautifulSoup(login_response.text, 'html.parser')
 
         # we know that the login was successful if we find an HTML element with the class of 'profile-name'
         if login_soup.find(attrs={'class' : 'profile-name'}) or login_soup.find(attrs={'class' : 'profile-icon'}):
@@ -483,9 +487,11 @@
 
         # fetch the index page again, so that we can fetch the corresponding user data
         browse_response = self.session.get(self._get_document_url_for(component='browse'), verify=self.verify_ssl)
-        browse_soup = BeautifulSoup(browse_response.text)
+        only_script_tags = SoupStrainer('script')
+        browse_soup = BeautifulSoup(browse_response.text, 'html.parser', parse_only=only_script_tags)
         self._parse_page_contents(page_soup=browse_soup)
         account_hash = self._generate_account_hash(account=account)
+        self.user_data['guid'] = profile_id;
         self._save_data(filename=self.data_path + '_' + account_hash)
         return True
@@ -1161,11 +1167,11 @@
         :obj:`str`
             Quality of the video
         """
-        quality = '540'
+        quality = '720'
         if video['videoQuality']['hasHD']:
-            quality = '720'
-        if video['videoQuality']['hasUltraHD']:
             quality = '1080'
+        if video['videoQuality']['hasUltraHD']:
+            quality = '4000'
         return quality
 
     def parse_runtime_for_video (self, video):
@@ -1312,8 +1318,14 @@
         for key in videos.keys():
             if self._is_size_key(key=key) == False:
                 video_key = key
+        # get season index
+        sorting = {}
+        for idx in videos[video_key]['seasonList']:
+            if self._is_size_key(key=idx) == False and idx != 'summary':
+                sorting[int(videos[video_key]['seasonList'][idx][1])] = int(idx)
         return {
             season['summary']['id']: {
+                'idx': sorting[season['summary']['id']],
                 'id': season['summary']['id'],
                 'text': season['summary']['name'],
                 'shortName': season['summary']['shortName'],
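
The new `idx` field above derives each season's display order from its position in the show's `seasonList`, whose numeric keys map to reference pairs with the season id in the second slot (that is what `seasonList[idx][1]` implies). A toy version of that lookup, with invented ids and a stand-in for `_is_size_key`:

    # seasonList roughly as it appears in the Falcor-style response cache
    season_list = {
        '0': ['seasons', '80001'],
        '1': ['seasons', '80002'],
        'summary': {'size': 2},
    }

    def is_size_key(key):
        # stand-in for NetflixSession._is_size_key
        return key == '$size' or key == 'size'

    sorting = {}
    for idx in season_list:
        if is_size_key(key=idx) == False and idx != 'summary':
            sorting[int(season_list[idx][1])] = int(idx)

    print(sorting)
    # {80001: 0, 80002: 1}
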
@@ -1467,7 +1479,7 @@
             Instance of a BeautifulSoup document containing the complete page contents
         """
         response = self.session.get(self._get_document_url_for(component='browse'), verify=self.verify_ssl)
-        return BeautifulSoup(response.text)
+        return BeautifulSoup(response.text, 'html.parser')
 
     def fetch_video_list_ids (self, list_from=0, list_to=50):
         """Fetches the JSON with detailed information based on the lists on the landing page (browse page) of Netflix
@@ -1735,9 +1747,9 @@
         # load the profiles page (to verify the user)
         response = self.session.get(self._get_document_url_for(component='profiles'), verify=self.verify_ssl)
 
         # parse out the needed inline information
-        page_soup = BeautifulSoup(response.text)
-        page_data = self.extract_inline_netflix_page_data(page_soup=page_soup)
-        self._parse_page_contents(page_soup)
+        only_script_tags = SoupStrainer('script')
+        page_soup = BeautifulSoup(response.text, 'html.parser', parse_only=only_script_tags)
+        page_data = self._parse_page_contents(page_soup=page_soup)
         account_hash = self._generate_account_hash(account=account)
         self._save_data(filename=self.data_path + '_' + account_hash)
@@ -1850,6 +1862,34 @@
         # return the parsed response & everything´s fine
         return response.json()
 
+    def _to_unicode(self, str):
+        '''Attempt to fix a non utf-8 string into utf-8, using a limited set of encodings
+
+        Parameters
+        ----------
+        str : `str`
+            String to decode
+
+        Returns
+        -------
+        `str`
+            Decoded string
+        '''
+        # fuller list of encodings at http://docs.python.org/library/codecs.html#standard-encodings
+        if not str: return u''
+        u = None
+        # we could add more encodings here, as warranted.
+        encodings = ('ascii', 'utf8', 'latin1')
+        for enc in encodings:
+            if u: break
+            try:
+                u = unicode(str, enc)
+            except UnicodeDecodeError:
+                pass
+        if not u:
+            u = unicode(str, errors='replace')
+        return u
+
     def _update_my_list (self, video_id, operation):
         """Tiny helper to add & remove items from "my list"
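
The `_to_unicode` helper added in the last hunk follows a common Python 2 recipe: try a few strict decodings in order of likelihood and fall back to replacement characters, so callers never see a UnicodeDecodeError. Standalone (Python 2, like the module itself):

    def to_unicode(raw):
        # try strict decoding with the most likely codecs first
        for enc in ('ascii', 'utf8', 'latin1'):
            try:
                return unicode(raw, enc)
            except UnicodeDecodeError:
                pass
        # last resort: substitute undecodable bytes instead of raising
        return unicode(raw, errors='replace')

    print(to_unicode('caf\xc3\xa9'))  # ascii fails, utf8 succeeds: u'caf\xe9'

Note that latin1 decodes any byte sequence, so the replacement fallback is effectively just a guard; the helper keeps it anyway, as the original does.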