Spaces:

uc-ctds
/

llama-data-model-generator-demo

Sleeping

App Files Files Community

llama-data-model-generator-demo / shared.py

avantol

feat(app): more examples, better parsing and error handling

cc69c66 7 months ago

raw

history blame contribute delete

10.4 kB

	"""
	Functions used in several different places. This file should not import from any other non-lib files to prevent
	circular dependencies.
	"""

	import json
	import logging
	from copy import copy
	from typing import Any, Callable, Dict, Optional, Tuple, Union

	# NOTE(review): presumably the keys expected at the top level of a node definition.
	# Not referenced in the visible part of this file - confirm against the modules that import it.
	TOP_LEVEL_IDENTIFIERS = {"description", "links", "properties"}


	def get_json_from_model_output(input_generated_json: str):
	    """
	    Parses a string, potentially containing Markdown code fences, into a JSON object.

	    Two attempts are made:

	    1. The text (with a surrounding ```json ... ``` or ``` ... ``` fence removed) is parsed
	       exactly as given.
	    2. If that fails, yields nothing, or `check_contents_valid` flags the result, the text is
	       repaired with `balance_braces` / `_get_valid_json_from_string` and parsed again. This
	       works in limited cases where the model output was simply cut off.

	    Anything that does not parse to a JSON object or array counts as a failed attempt, so a
	    bare scalar such as `5` or `null` yields an empty dictionary instead of an exception.
	    Debug messages are logged for each failure and recovery.

	    Args:
	        input_generated_json: A string potentially containing a JSON object.

	    Returns:
	        A tuple containing:
	        - The parsed JSON (a dictionary, or a list when the output was a JSON array), or an
	          empty dictionary if parsing failed.
	        - An integer: 1 when `check_contents_valid` flagged both the first parse and the
	          repaired parse, otherwise 0.
	    """
	    originally_invalid_json_count = 0

	    # Attempt 1: parse the output as is. Best effort by design, hence the broad except.
	    try:
	        candidate, _ = _strip_markdown_fences(input_generated_json)
	        generated_json_attempt_1 = json.loads(candidate)
	        if not isinstance(generated_json_attempt_1, (dict, list)):
	            raise ValueError("model output is a JSON scalar, not an object or array")
	    except Exception as exc:
	        logging.debug(f"could not parse AI model generated output as JSON. Exc: {exc}.")
	        generated_json_attempt_1 = {}

	    some_value_in_attempt_1_is_not_a_dict = check_contents_valid(
	        generated_json_attempt_1
	    )
	    attempt_1_failed = bool(
	        not generated_json_attempt_1 or some_value_in_attempt_1_is_not_a_dict
	    )

	    # Attempt 2: only when attempt 1 produced nothing usable.
	    generated_json_attempt_2 = {}
	    if attempt_1_failed:
	        logging.debug(
	            "Attempting to make output valid to obtain better metrics (this works in limited cases where "
	            "the model output was simply cut off)"
	        )
	        try:
	            candidate, was_fenced = _strip_markdown_fences(input_generated_json)
	            if was_fenced:
	                generated_json_attempt_2 = json.loads(
	                    _get_valid_json_from_string(candidate)
	                )
	            else:
	                balance_outcome = attempt(json.loads, (balance_braces(candidate),))
	                # `attempt` reports failure as an {"error": <msg>} dict; any other
	                # result (including lists and scalars) is a successful parse.
	                balance_failed = (
	                    isinstance(balance_outcome, dict) and "error" in balance_outcome
	                )
	                if not balance_failed:
	                    generated_json_attempt_2 = balance_outcome
	                else:
	                    generated_json_attempt_2 = json.loads(
	                        _get_valid_json_from_string(candidate)
	                    )
	            if not isinstance(generated_json_attempt_2, (dict, list)):
	                raise ValueError("repaired output is a JSON scalar, not an object or array")

	            logging.debug(
	                "Success! Reconstructed valid JSON from unparseable model output. Continuing metrics comparison..."
	            )
	        except Exception:
	            logging.debug(
	                "Failed. Setting model output as empty JSON to enable metrics comparison."
	            )
	            generated_json_attempt_2 = {}

	    some_value_in_attempt_2_is_not_a_dict = (
	        attempt_1_failed
	        and isinstance(generated_json_attempt_2, dict)
	        and check_contents_valid(generated_json_attempt_2)
	    )
	    if some_value_in_attempt_1_is_not_a_dict and some_value_in_attempt_2_is_not_a_dict:
	        logging.debug("Could not recover model output json, aborting!")
	        originally_invalid_json_count += 1

	    generated_json = (
	        generated_json_attempt_2 if attempt_1_failed else generated_json_attempt_1
	    )
	    return generated_json, originally_invalid_json_count


	def _strip_markdown_fences(text: str) -> Tuple[str, bool]:
	    """
	    Pulls the JSON candidate out of text that may be wrapped in a Markdown code fence.

	    Args:
	        text: raw model output.

	    Returns:
	        A tuple of (candidate text, whether a code fence was found). When fenced, the
	        candidate is the content after the first fence marker (up to the next marker, if
	        any) with a leading `json` language tag removed; otherwise it is `text` unchanged.
	    """
	    code_split = text.split("```")
	    if len(code_split) == 1:
	        return text, False
	    fenced = code_split[1]
	    # The language tag is optional: a bare fence must parse as well as a json-tagged one.
	    if fenced.startswith("json"):
	        fenced = fenced[len("json"):]
	    return fenced, True


	def check_contents_valid(generated_json_attempt_1: Union[list, dict]):
	    """
	    Checks that the sub nodes are all dicts (not lists, strings, numbers, ...).

	    Which sub nodes are inspected depends on the input:
	    - a list: its items
	    - a dict with a "nodes" key: the items of that "nodes" value only
	    - any other dict: its values

	    Despite the name, the result is truthy when the contents are NOT valid.

	    Args:
	        generated_json_attempt_1 (Union[list, dict]): data to check

	    Returns:
	        None when every inspected sub node is a dict (or there are none). Otherwise a truthy
	        value: the first offending item, or True when that item is itself falsy (None, 0, "",
	        [] ...) so that it cannot be mistaken for "valid". Input that is neither a list nor a
	        dict, and a "nodes" value that is neither a list nor a dict, are reported the same way.
	    """
	    if isinstance(generated_json_attempt_1, list):
	        sub_nodes = generated_json_attempt_1
	    elif isinstance(generated_json_attempt_1, dict):
	        if "nodes" in generated_json_attempt_1:
	            sub_nodes = generated_json_attempt_1["nodes"]
	            if not isinstance(sub_nodes, (list, dict)):
	                # A scalar "nodes" value holds no sub nodes at all.
	                return sub_nodes or True
	        else:
	            sub_nodes = generated_json_attempt_1.values()
	    else:
	        # A bare scalar has no sub nodes to validate.
	        return generated_json_attempt_1 or True

	    for item in sub_nodes:
	        if not isinstance(item, dict):
	            # `or True` keeps the result truthy even for falsy offenders such as [] or None.
	            return item or True
	    return None


	def _get_valid_json_from_string(s):
	    """
	    Given a JSON string with potentially unclosed strings, arrays, or objects, close those things
	    to hopefully be able to parse as valid JSON.

	    The text is scanned once while tracking whether the scanner is inside a quoted string.
	    Inside a string, brackets, the other kind of quote and backslash-escaped quotes are
	    treated as plain content, so only structural brackets end up being closed.

	    Args:
	        s: JSON text, possibly cut off mid-document.

	    Returns:
	        `s` (minus a trailing structural comma) with the missing closing quote and brackets
	        appended. The result is not guaranteed to be parseable.

	    Raises:
	        ValueError: if a closing bracket does not match the most recently opened bracket.
	    """
	    closers = {"{": "}", "[": "]"}
	    open_quote = None  # quote character of the string currently being scanned, if any
	    escaped = False  # previous character was a backslash inside a string
	    brackets = []

	    for i, c in enumerate(s):
	        if open_quote is not None:
	            if escaped:
	                escaped = False
	            elif c == "\\":
	                escaped = True
	            elif c == open_quote:
	                open_quote = None
	            continue

	        if c == '"' or c == "'":
	            open_quote = c
	        elif c in closers:
	            brackets.append((i, c))
	        elif c == "}" or c == "]":
	            # A closer with nothing open is an error, but we can skip this for the task
	            if brackets:
	                last_opened = brackets.pop()
	                if closers[last_opened[1]] != c:
	                    raise ValueError(
	                        f"Mismatched brackets/quotes found: opened {last_opened[1]} @ {last_opened[0]} "
	                        f"but closed {c} @ {i}"
	                    )

	    # Remove trailing comma if it exists (a comma inside an unclosed string is content)
	    if open_quote is None and s.strip().endswith(","):
	        logging.debug("Removing ending ,")
	        s = s.strip().rstrip(",")

	    # Close the open string first, then the brackets from innermost to outermost
	    closing_chars = open_quote or ""
	    closing_chars += "".join(closers[opened] for _, opened in reversed(brackets))

	    logging.debug(f"closing_chars: {closing_chars}")

	    output_string = s + closing_chars

	    try:
	        json.loads(output_string)
	    except Exception:
	        logging.debug(
	            "JSON string still fails to be parseable, attempting another modification..."
	        )
	        # it's possible the closing quotes were on a property that didn't have a value, let's
	        # fix that and see if it works
	        #
	        # For example:
	        # ```
	        # {
	        # "properties": {
	        # "annotation
	        # ```
	        # for keys in objects with no value, append an empty value
	        new_closing_chars = closing_chars.replace('"', '": ""', 1)

	        logging.debug(f"new closing_chars: {new_closing_chars}")
	        output_string = s + new_closing_chars

	    return output_string


	def on_fail(
	    outcome: Union[Any, Dict[str, str]],
	    fallback: Union[Any, Callable] = None,
	):
	    """
	    Substitutes a fallback for a failed outcome.

	    An outcome counts as failed when it is a dict that contains an "error" key.

	    Args:
	        outcome: the value to inspect.
	        fallback: what to return in place of a failed outcome. When it is callable it is
	            called with the failed outcome and the call's result is returned instead.

	    Returns:
	        `outcome` unchanged when it is not a failure; otherwise `fallback`, or the result of
	        `fallback(outcome)` when `fallback` is callable.
	    """
	    succeeded = not isinstance(outcome, dict) or "error" not in outcome
	    if succeeded:
	        return outcome
	    if isinstance(fallback, Callable):
	        return fallback(outcome)
	    return fallback


	def attempt(
	    func: Callable,
	    args: Tuple[Any, ...] = (),
	    kwargs: Optional[Dict[str, Any]] = None,
	) -> Union[Any, Dict[str, str]]:
	    """
	    Attempts to execute a function with the provided arguments.

	    If the function raises an exception, the exception is caught and returned in a dict.

	    Args:
	        func (Callable): The function to execute.
	        args (Tuple[Any, ...], optional): A tuple of positional arguments for the function.
	        kwargs (Optional[Dict[str, Any]], optional): A dictionary of keyword arguments for the function.

	    Returns:
	        Function result or {"error": <msg>} response
	    """
	    kwargs = kwargs or {}
	    try:
	        # Unpack both containers: `args` spreads into positional arguments and `kwargs`
	        # into keyword arguments, rather than passing the tuple itself as one argument.
	        return func(*args, **kwargs)
	    except Exception as exc:
	        return {"error": str(exc)}


	def balance_braces(s: str) -> str:
	    """
	    Evens out the number of '{' and '}' characters in a string.

	    This is a primitive recovery step for model output: it only compares the two counts
	    and pays no attention to where the braces occur.

	    Args:
	        s(str): string to balance braces on.

	    Returns:
	        `s` with the missing '}' appended when there are more '{' than '}', with the missing
	        '{' prepended when there are more '}' than '{', or unchanged when the counts match.
	    """
	    surplus_open = s.count("{") - s.count("}")
	    if surplus_open > 0:
	        return s + "}" * surplus_open
	    if surplus_open < 0:
	        return "{" * -surplus_open + s
	    return s


	def flatten_list(coll):
	    """
	    Flattens one level of nesting into a single list.

	    Args:
	        coll: an iterable whose elements are themselves iterables (lists, sets, tuples, ...).

	    Returns:
	        A new list holding the items of every element of `coll`, in iteration order.
	    """
	    # Single pass; repeated `list + list` would copy the accumulated result every round.
	    return [item for group in coll for item in group]


	def keep_errors(collection):
	    """
	    Given a set of outcomes, keeps any that resulted in an error.

	    An outcome is an error when it is a dict containing an "error" key, which is the shape
	    `attempt` returns on failure and the test `on_fail` applies. Outcomes of any other type
	    (None, numbers, strings, lists ...) are never errors.

	    Args:
	        collection (Collection): collection of outcomes to filter.

	    Returns:
	        A list of all instances of the collection that contain an error response.
	    """
	    return [
	        instance
	        for instance in collection
	        if isinstance(instance, dict) and "error" in instance
	    ]