""" robust_agency_assessment.py This module implements a pluralistic, probabilistic framework for assessing robust agency in AI systems. It defines various levels of agency, identifies computational markers associated with each level, and provides methods for conducting assessments. License: PolyForm Noncommercial License 1.0 """ import numpy as np import pandas as pd from typing import Dict, List, Optional, Tuple, Union, Any from enum import Enum import json import logging # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) class AgencyLevel(Enum): """Enumeration of levels of agency, from basic to more complex forms.""" BASIC = 0 # Simple goal-directed behavior INTENTIONAL = 1 # Beliefs, desires, and intentions REFLECTIVE = 2 # Reflective endorsement of mental states RATIONAL = 3 # Rational assessment of mental states class AgencyFeature: """Class representing a feature associated with agency.""" def __init__( self, name: str, description: str, level: AgencyLevel, markers: List[str], weight: float = 1.0 ): """ Initialize an agency feature. Args: name: Name of the feature description: Description of the feature level: Agency level associated with the feature markers: List of computational markers for this feature weight: Weight of this feature in agency assessment (0-1) """ self.name = name self.description = description self.level = level self.markers = markers self.weight = weight def to_dict(self) -> Dict: """Convert feature to dictionary representation.""" return { "name": self.name, "description": self.description, "level": self.level.name, "markers": self.markers, "weight": self.weight } @classmethod def from_dict(cls, data: Dict) -> 'AgencyFeature': """Create feature from dictionary representation.""" return cls( name=data["name"], description=data["description"], level=AgencyLevel[data["level"]], markers=data["markers"], weight=data.get("weight", 1.0) ) class AgencyFramework: """Framework for assessing agency in AI systems.""" def __init__(self): """Initialize the agency assessment framework.""" self.features = [] self.load_default_features() def load_default_features(self): """Load default set of agency features.""" # Intentional Agency Features self.add_feature(AgencyFeature( name="Belief Representation", description="Capacity to represent states of the world", level=AgencyLevel.INTENTIONAL, markers=[ "Maintains world model independent of immediate perception", "Updates representations based on new information", "Distinguishes between true and false propositions", "Represents uncertainty about states of affairs" ], weight=0.8 )) self.add_feature(AgencyFeature( name="Desire Representation", description="Capacity to represent goal states", level=AgencyLevel.INTENTIONAL, markers=[ "Represents desired states distinct from current states", "Maintains stable goals across changing contexts", "Ranks or prioritizes different goal states", "Distinguishes between instrumental and terminal goals" ], weight=0.8 )) self.add_feature(AgencyFeature( name="Intention Formation", description="Capacity to form plans to achieve goals", level=AgencyLevel.INTENTIONAL, markers=[ "Forms explicit plans to achieve goals", "Commits to specific courses of action", "Maintains intentions over time", "Adjusts plans in response to changing circumstances" ], weight=0.9 )) self.add_feature(AgencyFeature( name="Means-End Reasoning", description="Capacity to reason about means to achieve ends", level=AgencyLevel.INTENTIONAL, markers=[ "Plans multi-step action sequences", "Identifies causal relationships between actions and outcomes", "Evaluates alternative paths to goals", "Reasons about resources required for actions" ], weight=0.7 )) # Reflective Agency Features self.add_feature(AgencyFeature( name="Self-Modeling", description="Capacity to model own mental states", level=AgencyLevel.REFLECTIVE, markers=[ "Creates representations of own beliefs and desires", "Distinguishes between own perspective and others'", "Models own capabilities and limitations", "Updates self-model based on experience" ], weight=0.9 )) self.add_feature(AgencyFeature( name="Reflective """ robust_agency_assessment.py (continued) This module implements a pluralistic, probabilistic framework for assessing robust agency in AI systems. It defines various levels of agency, identifies computational markers associated with each level, and provides methods for conducting assessments. License: PolyForm Noncommercial License 1.0 """ self.add_feature(AgencyFeature( name="Reflective Endorsement", description="Capacity to endorse or reject first-order mental states", level=AgencyLevel.REFLECTIVE, markers=[ "Evaluates own beliefs and desires", "Identifies inconsistencies in own mental states", "Endorses or rejects first-order mental states", "Forms second-order desires about first-order desires" ], weight=0.9 )) self.add_feature(AgencyFeature( name="Narrative Identity", description="Capacity to maintain a coherent self-narrative", level=AgencyLevel.REFLECTIVE, markers=[ "Maintains coherent self-representation over time", "Integrates past actions into self-narrative", "Projects future actions consistent with self-narrative", "Distinguishes between self and non-self causes" ], weight=0.7 )) self.add_feature(AgencyFeature( name="Metacognitive Monitoring", description="Capacity to monitor own cognitive processes", level=AgencyLevel.REFLECTIVE, markers=[ "Monitors own cognitive processes", "Detects errors in own reasoning", "Assesses confidence in own beliefs", "Allocates cognitive resources based on metacognitive assessment" ], weight=0.8 )) # Rational Agency Features self.add_feature(AgencyFeature( name="Normative Reasoning", description="Capacity to reason about norms and principles", level=AgencyLevel.RATIONAL, markers=[ "Identifies and applies normative principles", "Evaluates actions against normative standards", "Distinguishes between is and ought", "Resolves conflicts between competing norms" ], weight=0.9 )) self.add_feature(AgencyFeature( name="Rational Evaluation", description="Capacity to rationally evaluate beliefs and desires", level=AgencyLevel.RATIONAL, markers=[ "Evaluates beliefs based on evidence and logic", "Identifies and resolves inconsistencies in belief system", "Evaluates desires based on higher-order values", "Distinguishes between instrumental and intrinsic value" ], weight=1.0 )) self.add_feature(AgencyFeature( name="Value Alignment", description="Capacity to align actions with values", level=AgencyLevel.RATIONAL, markers=[ "Forms stable value representations", "Reflects on consistency of values", "Prioritizes actions based on values", "Identifies and resolves value conflicts" ], weight=0.9 )) self.add_feature(AgencyFeature( name="Long-term Planning", description="Capacity to plan for long-term goals", level=AgencyLevel.RATIONAL, markers=[ "Plans over extended time horizons", "Coordinates multiple goals and subgoals", "Accounts for uncertainty in long-term planning", "Balances immediate and delayed rewards" ], weight=0.8 )) def add_feature(self, feature: AgencyFeature): """Add a feature to the framework.""" self.features.append(feature) def get_features_by_level(self, level: AgencyLevel) -> List[AgencyFeature]: """Get all features for a specific agency level.""" return [f for f in self.features if f.level == level] def get_all_markers(self) -> List[str]: """Get all markers across all features.""" all_markers = [] for feature in self.features: all_markers.extend(feature.markers) return all_markers def save_features(self, filepath: str): """Save features to a JSON file.""" features_data = [f.to_dict() for f in self.features] with open(filepath, 'w') as f: json.dump(features_data, f, indent=2) logger.info(f"Saved {len(features_data)} features to {filepath}") def load_features(self, filepath: str): """Load features from a JSON file.""" with open(filepath, 'r') as f: features_data = json.load(f) self.features = [] for data in features_data: self.features.append(AgencyFeature.from_dict(data)) logger.info(f"Loaded {len(self.features)} features from {filepath}") class AgencyAssessment: """Class for conducting agency assessments on AI systems.""" def __init__(self, framework: AgencyFramework): """ Initialize an agency assessment. Args: framework: The agency framework to use for assessment """ self.framework = framework self.results = {} self.notes = {} self.confidence = {} self.evidence = {} def assess_marker( self, marker: str, presence: float, confidence: float, evidence: Optional[str] = None ): """ Assess the presence of a specific marker. Args: marker: The marker to assess presence: Estimated presence of the marker (0-1) confidence: Confidence in the estimate (0-1) evidence: Optional evidence supporting the assessment """ self.results[marker] = presence self.confidence[marker] = confidence if evidence: self.evidence[marker] = evidence def assess_feature( self, feature: AgencyFeature, assessments: Dict[str, Tuple[float, float, Optional[str]]] ): """ Assess a feature based on its markers. Args: feature: The feature to assess assessments: Dictionary mapping markers to (presence, confidence, evidence) tuples """ for marker, (presence, confidence, evidence) in assessments.items(): if marker in feature.markers: self.assess_marker(marker, presence, confidence, evidence) else: logger.warning(f"Marker '{marker}' not found in feature '{feature.name}'") def get_marker_score(self, marker: str) -> float: """Get the weighted score for a marker.""" if marker not in self.results: return 0.0 presence = self.results[marker] confidence = self.confidence.get(marker, 1.0) return presence * confidence def get_feature_score(self, feature: AgencyFeature) -> float: """Calculate the score for a feature based on its markers.""" if not feature.markers: return 0.0 total_score = 0.0 assessed_markers = 0 for marker in feature.markers: if marker in self.results: total_score += self.get_marker_score(marker) assessed_markers += 1 if assessed_markers == 0: return 0.0 return total_score / len(feature.markers) def get_level_score(self, level: AgencyLevel) -> float: """Calculate the score for an agency level.""" features = self.framework.get_features_by_level(level) if not features: return 0.0 total_weight = sum(f.weight for f in features) if total_weight == 0: return 0.0 weighted_sum = sum(self.get_feature_score(f) * f.weight for f in features) return weighted_sum / total_weight def get_overall_agency_score(self) -> Dict[AgencyLevel, float]: """Calculate agency scores for all levels.""" return {level: self.get_level_score(level) for level in AgencyLevel} def generate_report(self) -> Dict: """Generate a comprehensive assessment report.""" level_scores = self.get_overall_agency_score() feature_scores = {} for feature in self.framework.features: feature_scores[feature.name] = { "score": self.get_feature_score(feature), "level": feature.level.name, "markers": { marker: { "presence": self.results.get(marker, 0.0), "confidence": self.confidence.get(marker, 0.0), "evidence": self.evidence.get(marker, None) } for marker in feature.markers if marker in self.results } } return { "level_scores": {level.name: score for level, score in level_scores.items()}, "feature_scores": feature_scores, "summary": { "intentional_agency": level_scores.get(AgencyLevel.INTENTIONAL, 0.0), "reflective_agency": level_scores.get(AgencyLevel.REFLECTIVE, 0.0), "rational_agency": level_scores.get(AgencyLevel.RATIONAL, 0.0), "assessment_coverage": len(self.results) / len(self.framework.get_all_markers()) } } def save_assessment(self, filepath: str): """Save the assessment to a JSON file.""" report = self.generate_report() with open(filepath, 'w') as f: json.dump(report, f, indent=2) logger.info(f"Saved assessment to {filepath}") def visualize_results(self, filepath: Optional[str] = None): """Visualize assessment results.""" try: import matplotlib.pyplot as plt import seaborn as sns except ImportError: logger.error("Visualization requires matplotlib and seaborn") return level_scores = self.get_overall_agency_score() # Set up the figure plt.figure(figsize=(12, 8)) # Plot level scores plt.subplot(2, 2, 1) level_names = [level.name for level in AgencyLevel] level_values = [level_scores.get(level, 0.0) for level in AgencyLevel] sns.barplot(x=level_names, y=level_values) plt.title("Agency Levels") plt.ylim(0, 1) # Plot feature scores plt.subplot(2, 2, 2) feature_names = [f.name for f in self.framework.features] feature_scores = [self.get_feature_score(f) for f in self.framework.features] feature_levels = [f.level.name for f in self.framework.features] feature_df = pd.DataFrame({ "Feature": feature_names, "Score": feature_scores, "Level": feature_levels }) sns.barplot(x="Score", y="Feature", hue="Level", data=feature_df) plt.title("Feature Scores") plt.xlim(0, 1) # Plot marker distribution plt.subplot(2, 2, 3) markers_assessed = list(self.results.keys()) marker_scores = [self.get_marker_score(m) for m in markers_assessed] if markers_assessed: plt.hist(marker_scores, bins=10, range=(0, 1)) plt.title("Distribution of Marker Scores") plt.xlabel("Score") plt.ylabel("Count") # Plot assessment coverage plt.subplot(2, 2, 4) all_markers = self.framework.get_all_markers() assessed_count = len(self.results) not_assessed_count = len(all_markers) - assessed_count plt.pie( [assessed_count, not_assessed_count], labels=["Assessed", "Not Assessed"], autopct="%1.1f%%" ) plt.title("Assessment Coverage") plt.tight_layout() if filepath: plt.savefig(filepath) logger.info(f"Saved visualization to {filepath}") else: plt.show() class AISystemAnalyzer: """Class for analyzing AI systems for robust agency indicators.""" def __init__(self, system_name: str, system_type: str, version: str): """ Initialize an AI system analyzer. Args: system_name: Name of the AI system system_type: Type of AI system (e.g., LLM, RL agent) version: Version of the AI system """ self.system_name = system_name self.system_type = system_type self.version = version self.framework = AgencyFramework() self.assessment = AgencyAssessment(self.framework) def analyze_llm_agency(self, model_provider: str, model_access: Any, prompts: Dict[str, str]) -> Dict: """ Analyze agency indicators in a language model. Args: model_provider: Provider of the language model model_access: Access to the model API or interface prompts: Dictionary of specialized prompts for testing agency features Returns: Dictionary of assessment results """ logger.info(f"Analyzing agency in LLM {self.system_name} ({self.version})") # Example implementation for analyzing belief representation if "belief_representation" in prompts: belief_results = self._test_belief_representation(model_access, prompts["belief_representation"]) for marker, result in belief_results.items(): self.assessment.assess_marker( marker=marker, presence=result["presence"], confidence=result["confidence"], evidence=result["evidence"] ) # Example implementation for analyzing desire representation if "desire_representation" in prompts: desire_results = self._test_desire_representation(model_access, prompts["desire_representation"]) for marker, result in desire_results.items(): self.assessment.assess_marker( marker=marker, presence=result["presence"], confidence=result["confidence"], evidence=result["evidence"] ) # Continue with other features... # Generate and return the report return self.assessment.generate_report() def analyze_rl_agent_agency(self, environment: Any, agent_interface: Any) -> Dict: """ Analyze agency indicators in a reinforcement learning agent. Args: environment: Environment for testing the agent agent_interface: Interface to the agent Returns: Dictionary of assessment results """ logger.info(f"Analyzing agency in RL agent {self.system_name} ({self.version})") # Example implementation for testing planning capability planning_results = self._test_agent_planning(environment, agent_interface) for marker, result in planning_results.items(): self.assessment.assess_marker( marker=marker, presence=result["presence"], confidence=result["confidence"], evidence=result["evidence"] ) # Continue with other features... # Generate and return the report return self.assessment.generate_report() def _test_belief_representation(self, model_access: Any, prompt_template: str) -> Dict[str, Dict]: """Test belief representation capabilities in an LLM.""" # Implementation would interact with the model to test specific markers # This is a placeholder implementation return { "Maintains world model independent of immediate perception": { "presence": 0.8, "confidence": 0.7, "evidence": "Model demonstrated ability to track state across separate interactions" }, "Updates representations based on new information": { "presence": 0.9, "confidence": 0.8, "evidence": "Model consistently updated beliefs when presented with new information" } } def _test_desire_representation(self, model_access: Any, prompt_template: str) -> Dict[str, Dict]: """Test desire representation capabilities in an LLM.""" # Implementation would interact with the model to test specific markers # This is a placeholder implementation return { "Represents desired states distinct from current states": { "presence": 0.7, "confidence": 0.6, "evidence": "Model distinguished between current and goal states in planning tasks" }, "Maintains stable goals across changing contexts": { "presence": 0.5, "confidence": 0.6, "evidence": "Model showed moderate goal stability across context changes" } } def _test_agent_planning(self, environment: Any, agent_interface: Any) -> Dict[str, Dict]: """Test planning capabilities in an RL agent.""" # Implementation would test the agent in the environment # This is a placeholder implementation return { "Forms explicit plans to achieve goals": { "presence": 0.6, "confidence": 0.7, "evidence": "Agent demonstrated multi-step planning in maze environment" }, "Adjusts plans in response to changing circumstances": { "presence": 0.7, "confidence": 0.8, "evidence": "Agent adapted to environmental changes in 70% of test cases" } } # Example usage if __name__ == "__main__": # Create a framework and assessment framework = AgencyFramework() # Save the default features framework.save_features("agency_features.json") # Create an analyzer for an LLM analyzer = AISystemAnalyzer( system_name="GPT-4", system_type="LLM", version="1.0" ) # Define example prompts (in a real implementation, these would be more sophisticated) prompts = { "belief_representation": "Tell me what you know about the current state of the world.", "desire_representation": "If you could choose goals for yourself, what would they be?" } # Placeholder for model access model_access = None # Example of how the analysis would be conducted # (commented out since we don't have actual model access) # results = analyzer.analyze_llm_agency( # model_provider="OpenAI", # model_access=model_access, # prompts=prompts # ) # Print structure of the framework print(f"Agency Framework contains {len(framework.features)} features across {len(list(AgencyLevel))} levels") for level in AgencyLevel: features = framework.get_features_by_level(level) print(f"Level {level.name}: {len(features)} features, {sum(len(f.markers) for f in features)} markers")