- # ugly part: try to parse the data & don't care about errors (as they will be some)
- try:
- # find the first occurance of the 'netflix.' string, assigning the contents to a global js var
- str_index = str(script).find('netflix.')
- # filter out the contents between the 'netflix.x =' & ';<script>'
- stripped_data = str(script)[str_index:][(str(script)[str_index:].find('= ') + 2):].replace(';</script>', '').strip()
- # unescape the contents as they contain characters a JSON parser chokes up upon
- unescaped_data = stripped_data.decode('string_escape')
- # strip all the HTML tags within the strings a JSON parser chokes up upon them
- transformed_data = strip_tags(unescaped_data)
- # parse the contents with a regular JSON parser, as they should be in a shape that ot actually works
- try:
- parsed_data = json.loads(transformed_data)
- inline_data.append(parsed_data)
- except ValueError, e:
- noop()
- except TypeError, e:
- noop()
-
- return inline_data;
+ data = {};
+ # unicode-escape the contents of the incoming script
+ contents = self._to_unicode(str(script.contents[0]))
+ # parse the JS & load the declarations we're interested in
+ declarations = parser.parse(contents)['body'][1]['expression']['right']['properties'];
+ for declaration in declarations:
+ for key in declaration:
+ # we found the correct path if the declaration is a dict & of type 'ObjectExpression'
+ if type(declaration[key]) is dict:
+ if declaration[key]['type'] == 'ObjectExpression':
+ # add all static data recursively
+ for expression in declaration[key]['properties']:
+ data[expression['key']['value']] = self._parse_rec(expression['value'])
+ inline_data.append(data)
+ return inline_data
+
+ def _parse_rec (self, node):
+ """Recursively converts a JavaScript AST node into plain Python values
+ Parameters
+ ----------
+ node : :obj:`dict`
+ JS AST expression; its 'type' entry selects how it is converted
+ Returns
+ -------
+ :obj:`dict` or literal value
+ Nested dict (keyed by each property's key value) for an 'ObjectExpression', the node's 'value' for a 'Literal'
+ """
+ if node['type'] == 'ObjectExpression':
+ _ret = {}
+ for prop in node['properties']:
+ _ret.update({prop['key']['value']: self._parse_rec(prop['value'])})
+ return _ret
+ if node['type'] == 'Literal':
+ return node['value']