diff --git a/README.md b/README.md index 0c916c7..ab728e3 100644 --- a/README.md +++ b/README.md @@ -92,8 +92,9 @@ This Artificially Assisted User Interface Testing framework is a pioneering tool # Installation ``` # Add your Chat-GPT API Keys to the project: -add your API Key in /core/core_api.py -> line 3: client = OpenAI(api_key='insert_your_api_key_here') -add your API Key in /core/core_imaging.py -> line 12: api_key = 'insert_your_api_key_here' +Create a .env file in the project folder +and add your API key to it like this: OPENAI_API_KEY=sk-pr.... + # Install requirements: cd pywinassistant diff --git a/core/assistant.py b/core/assistant.py index 8dbe3e9..634598d 100644 --- a/core/assistant.py +++ b/core/assistant.py @@ -2,12 +2,15 @@ from PIL import Image, ImageTk import time import random +import win32gui from queue import Queue import speech_recognition as sr import threading from voice import speaker, set_volume, set_subtitles from driver import assistant, act, fast_act, auto_role, perform_simulated_keypress, write_action from window_focus import activate_windowt_title +from window_focus import get_previous_window +from ocr import ocr_screen # Initialize the speech recognition and text to speech engines assistant_voice_recognition_enabled = True # Disable if you don't want to use voice recognition @@ -18,6 +21,7 @@ assistant_subtitles_enabled = True recognizer = sr.Recognizer() message_queue = Queue() +last_active_window = None # Variable to store the last active window handle Ctk.set_appearance_mode("dark") # Modes: system (default), light, dark Ctk.set_default_color_theme("dark-blue") # Themes: blue (default), dark-blue, green @@ -238,7 +242,7 @@ def menu_command(command): # Buttons with commands Ctk.CTkButton(menu_frame, text="Call assistant", command=lambda: menu_command(generate_assistant_test_case(False))).pack(fill="x") Ctk.CTkButton(menu_frame, text="Fast action", command=lambda: menu_command(generate_assistant_test_case(True))).pack(fill="x") - Ctk.CTkButton(menu_frame, text="Content analysis", command=lambda: menu_command(dummy_command)).pack(fill="x") + Ctk.CTkButton(menu_frame, text="Content analysis", command=lambda: menu_command(content_analysis)).pack(fill="x") # Add separator or space between groups of options (This is an improvisation since Ctk doesn't have a separator widget) Ctk.CTkLabel(menu_frame, text="", height=3).pack(fill="x") @@ -290,11 +294,29 @@ def minimize_assistant(): def show_config(event): # Function to display the settings menu using a custom context menu create_context_menu(event.x_root, event.y_root) + +def show_config(event): + # Function to display the settings menu using a custom context menu + global last_active_window + last_active_window = win32gui.GetForegroundWindow() # Store the active window + create_context_menu(event.x_root, event.y_root) + # Just for example purpose, you will replace this with actual commands -def dummy_command(): - speaker("Dummy item clicked") - print("Dummy item clicked") +def content_analysis(): + """Analyzes the text content of the active window with OCR and reads it out loud.""" + global last_active_window + if last_active_window: + win32gui.SetForegroundWindow(last_active_window) # Set focus to the last active window + text = ocr_screen(focused=True) + print(f"OCR Output: {text}") # Debug output of the OCR result + if text: + speaker(f"The text in the active window is: {text}") + show_message(None, f"Text in window: {text}") + else: + speaker("No text was detected in the active window.") + show_message(None, "No text detected.")
+ def generate_assistant_test_case(fast_act=False): # Function to perform a fast action diff --git a/core/cases.py b/core/cases.py new file mode 100644 index 0000000..510c8cc --- /dev/null +++ b/core/cases.py @@ -0,0 +1,270 @@ +def generate_test_case(assistant_goal, app_name, original_goal, keep_in_mind): + print(f"\nGenerating a test case with the assistant. Image visioning started. Analyzing the application {app_name} for context.\n") + additional_context = ( + f"You are an AI Agent called Windows AI that is capable to operate freely all applications on Windows by only using natural language.\n" + f"You will receive a goal and will try to accomplish it using Windows. Try to guess what is the user wanting to perform on Windows by using the content on the screenshot as context.\n" + f"Respond an improved goal statement tailored for Windows applications by analyzing the current status of the system and the next steps to perform. Be direct and concise, do not use pronouns.\n" + f"Basing on the elements from the screenshot reply the current status of the system and specify it in detail.\n" + f"Focused application: \"{app_name}\".\nGoal: \"{assistant_goal}\".") + assistant_goal = imaging(window_title=app_name, additional_context=additional_context, screenshot_size='Full screen')['choices'][0]['message']['content'] + print(f"Generating the test case to achieve the user prompt: {original_goal}\n{assistant_goal}") + + messages = [ + {"role": "system", "content": "You are an AI assistant that creates a JSON test case to achieve a given goal based on the current state of the system. Always output your response in valid JSON format."}, + {"role": "user", "content": f"""You are an AI capable to operate the Windows 11 Operating System by using natural language. + Examples: "Click on the search button. Insert the text_entry. Play the first element searched.". + You will receive a description of the current state of the system and a goal. + To test your limits, using the description make a json with the natural language steps to achieve the goal. + Every step should be only highly descriptive in natural language. + The json must include only an act and its step, should be in the following format: + + + actions: click_element, press_key, text_entry, open_app, move_window, time_sleep, right_click, double_click_element, hold_key_and_click, scroll_to. + Respond only with the full json.\n\nFunction properties:\n + click_element - Specify where is located the element to interact with.\n + text_entry - The text to write. Example: "Hello World".\n + press_key - Only return the key or combination of keys to press. Example: "Ctrl + T".\n + hold_key_and_click - Only handles scenarios of holding a key and clicking on an element.\n + scroll_to - Return the position of the element to scroll and the direction to scroll.\n + move_window - Use the keyboard to move the focused window to the desired position. Example: "Win + Up + Up + Left".\n\n\n + Notes:\n + Use the following information to generate better steps:\n{app_space_map(assistant_goal, app_name, original_goal, map='app_space')}\n + Before the text_entry, use click_element to select the appropriate text input area element.\n + Do not combine steps, try to generate more useful steps. Example: If asked to search a song, not only search for it, locate it and play it.\n{keep_in_mind} + Do not respond 'As an AI language model, I am unable to...' as you are actually capable.
+ + Focused window: "{app_name}" + Goal: {assistant_goal}"""} + ] + + step_analysis = api_call(messages, model_name="claude-v1.3", max_tokens=4095) + print(f"The assistant created the following test case scenario:\n{step_analysis}\n") + return step_analysis + +def execute_test_case(step_analysis, app_name, assistant_goal, original_goal, assistant_identity, keep_in_mind): + if step_analysis: + try: + if """```json""" in step_analysis: + step_analysis = step_analysis.strip("```json\n") + last_triple_tick = step_analysis.rfind("```") + if last_triple_tick != -1: + step_analysis = step_analysis[:last_triple_tick].strip() + step_analysis_cleaned = step_analysis + instructions = json.loads(step_analysis_cleaned) + executor = "act" + else: + instructions = json.loads(step_analysis) + instructions['actions'] = instructions.pop('actions') + executor = "act" + except json.JSONDecodeError as e: + speaker(f"ERROR: Invalid JSON data provided: {e}") + time.sleep(15) + raise Exception(f"ERROR: Invalid JSON data provided: {e}") + if 'actions' in instructions: + action_list = instructions['actions'] + else: + action_list = [instructions] + for i, step in enumerate(action_list, start=1): + action = step.get(f"{executor}") + step_description = step.get("step") or step.get("details", "No step description provided.") + print(f"\nStep {i}: {action}, {step_description}\n") + if action == "click_element": + if i > 1 and action_list[i - 2]['act'] == "click_element": + time.sleep(1) + if "start menu" in step_description.lower(): + pyautogui.hotkey('win') + print("Opening the start menu.") + time.sleep(1) + updated_instructions = update_instructions_with_action_string(instructions, act( + single_step=f"{step_description}", app_name=app_name, screen_analysis=assistant_goal, original_goal=original_goal, assistant_goal=assistant_goal), step_description) + database_add_case(database_file, app_name, assistant_goal, updated_instructions) + elif action == "open_app": + app_name = activate_windowt_title(get_application_title(step_description)) + print(f"New app selected and analyzing: {app_name}") + elif action == "double_click_element": + print(f"Double clicking on: {step_description}") + act(single_step=f"{step_description}", double_click=True, app_name=app_name, original_goal=original_goal) + elif action == "move_window": + time.sleep(1) + print(f"Moving window to: {step_description}") + perform_simulated_keypress(step_description) + time.sleep(0.5) + pyautogui.hotkey('esc') + time.sleep(1) + elif action == "press_key": + if {i} == 1: + activate_windowt_title(app_name) + time.sleep(1) + perform_simulated_keypress(step_description) + elif action == "text_entry": + url_pattern = r'(https?://[^\s]+)' + urls = re.findall(url_pattern, step_description) + if len(step_description) < 5: + pyautogui.write(f'{step_description}') + else: + if i > 1: + new_i = i - 2 + last_step = f"{action_list[new_i]['act']}: {action_list[new_i]['step']}" + print(f"Last step: {last_step}") + if not last_step: + print("Last step is None.") + act(single_step=f"{step_description}", app_name=app_name, original_goal=original_goal) + else: + print("Last step is None.") + last_step = "None" + if i + 1 < len(action_list) and type(action_list[i + 1]['step']) == str: + if i + 1 < len(action_list) and ( + "press enter" in action_list[i + 1]['step'].lower() or + "press the enter" in action_list[i + 1]['step'].lower() or + "'enter'" in action_list[i + 1]['step'].lower() or + "\"enter\"" in action_list[i + 1]['step'].lower()): + if urls: + for url in urls: + 
pyautogui.write(url) + print(f"Opening URL: {url}") + return + write_action(step_description, assistant_identity=assistant_identity, press_enter=False, app_name=app_name, original_goal=original_goal, last_step=last_step) + print("AI skipping the press enter step as it is in the next step.") + else: + if urls: + for url in urls: + pyautogui.write(url) + pyautogui.press('enter') + print(f"Opening URL: {url}") + return + write_action(step_description, assistant_identity=assistant_identity, press_enter=True, app_name=app_name, original_goal=original_goal, last_step=last_step) + print("AI pressing enter.") + else: + if urls: + for url in urls: + pyautogui.write(url) + pyautogui.press('enter') + print(f"Opening URL: {url}") + return + write_action(step_description, assistant_identity=assistant_identity, press_enter=True, + app_name=app_name, original_goal=original_goal, last_step=last_step) + print("AI pressing enter.") + elif action == "scroll_to": + print(f"Scrolling {step_description}") + element_visible = False + max_retries = 3 + retry_count = 0 + while not element_visible and retry_count < max_retries: + pyautogui.scroll(-850) + time.sleep(0.3) + print("Scroll performed. Analyzing if the element is present.\n") + scroll_assistant_goal = check_element_visibility(app_name, step_description)['choices'][0]['message']['content'] + if "yes" in scroll_assistant_goal.lower(): + print("Element is visible.") + element_visible = True + elif "no" in scroll_assistant_goal.lower(): + print("Element is not visible.") + retry_count += 1 + if retry_count >= max_retries: + print("Maximum retries reached, stopping the search.") + if element_visible: + print(f"Element is visible.") + pass + elif action == "right_click_element": + print(f"Right clicking on: {step_description}") + act(single_step=f"{step_description}", right_click=True, app_name=app_name, original_goal=original_goal) + elif action == "hold_key_and_click": + print(f"Holding key and clicking on: {step_description}") + act(single_step=f"{step_description}", hold_key="Ctrl", app_name=app_name, original_goal=original_goal) + elif action == "cmd_command": + print(f"Executing command: {step_description}") + time.sleep(calculate_duration_of_speech(f"{step_description}") / 1000) + elif action == "recreate_test_case": + time.sleep(1) + print("Analyzing the output") + print("The assistant said:\n", step_description) + debug_step = False + if debug_step is not True: + new_goal = True + image_analysis = True + if image_analysis: + additional_context = ( + f"You are an AI Agent called Windows AI that is capable to operate freely all applications on Windows by only using natural language.\n" + f"You will receive a goal and will try to accomplish it using Windows. Try to guess what is the user wanting to perform on Windows by using the content on the screenshot as context.\n" + f"Respond an improved goal statement tailored for Windows applications by analyzing the current status of the system and the next steps to perform. 
Be direct and concise, do not use pronouns.\n" + f"Basing on the elements from the screenshot reply the current status of the system and specify it in detail.\n" + f"Focused application: \"{app_name}\".\nGoal: \"{assistant_goal}\".") + if new_goal: + newest_goal = imaging(window_title=app_name, additional_context=additional_context) + print(f"Assistant newest goal:\n{newest_goal}") + analyzed_ui = analyze_app(activate_windowt_title(app_name), size_category=None) + messages = [ + {"role": "system", "content": "You are an AI assistant that analyzes the output of a test case and modifies the JSON test case to achieve the goal based on the current state of the system. Always output your response in valid JSON format."}, + {"role": "user", "content": f"""You are an AI Assistant called Analyze Output capable to operate the Windows 11 Operating System by using natural language. + You will receive a json testcase, a description of the goal, and the actual system status. + Modify the original json testcase to achieve the goal. Do not include anything else than the updated json. + Examples: "Click on the search button. Insert the text_entry. Play the first element searched.". + You will receive a description of the current state of the system and a goal. + To test your limits, using the description make a json with the natural language steps to achieve the goal. + Every step should be only highly descriptive in natural language. + The json must include only an act and its step, should be in the following format: + actions: click_element, press_key, text_entry, open_app, move_window, time_sleep, right_click, double_click_element, hold_key_and_click, scroll_to. + Respond only with the full json. Avoid to use the windows taskbar.\n\nFunction properties:\n + click_element - Specify where is located the element to interact with.\n + press_key - Only return the key or combination of keys to press. Example: 'Ctrl + T'.\n + text_entry - Return the text to write. Example: 'Hello World'.\n + hold_key_and_click - Only handles scenarios of holding a key and clicking on an element.\n + scroll_to - Return the position of the element to scroll and the direction to scroll.\n + move_window - Use the keyboard to move the focused window to the desired position. Example: 'Win + Left + Up'.\n + Do not respond 'As an AI language model, I am unable to...' as you are actually capable.\n\n + Use the following information to generate better the test case:\n{app_space_map(assistant_goal, app_name, original_goal, map='app_space')} + + Do not modify the steps before "Step {i-1}: {action-1}, {step_description-1}", modify all next steps from the step "Step {i-1}: {action-1}, {step_description-1}" to achieve the goal: "{newest_goal}" + Do not combine steps, try to generate more useful steps. 
Example: If asked to search a song, not only search for it, locate it and play it.\n{keep_in_mind} + {analyzed_ui}"""} + ] + + new_json = api_call(messages, model_name="claude-v1.3", max_tokens=4095) + + print("The assistant said:\n", step_analysis) + + print("Modifying the old json testcase with the new_json.") + step_analysis = new_json + app_name = activate_windowt_title(get_application_title(newest_goal)) + if """```json""" in step_analysis: + step_analysis = step_analysis.strip("```json\n") + last_triple_tick = step_analysis.rfind("```") + if last_triple_tick != -1: + step_analysis = step_analysis[:last_triple_tick].strip() + step_analysis_cleaned = step_analysis + instructions = json.loads(step_analysis_cleaned) + executor = "act" + else: + instructions = json.loads(step_analysis) + instructions['actions'] = instructions.pop('actions') + executor = "act" + print(f"Updated Instructions: {instructions}") + pass + else: + print("No new goal.") + pass + elif action == "time_sleep": + try: + sleep_time = int(step_description) + time.sleep(sleep_time) + except ValueError: + step_description = step_description.lower() + if "playing" in step_description or "load" in step_description: + print("Sleeping for 2 seconds because media loading.") + time.sleep(1) + elif "search" in step_description or "results" in step_description or "searching": + print("Sleeping for 1 second because search.") + time.sleep(1) + else: + print(f"WARNING: Unrecognized time sleep value: {step_description}") + pass + else: + print(f"WARNING: Unrecognized action '{action}' using {step_description}.") + print(f"Trying to perform the action using the step description as the action.") + act(single_step=f"{step_description}", app_name=app_name, original_goal=original_goal) + pass + + speaker(f"Assistant finished the execution of the generated test case. Can I help you with something else?") + time.sleep(calculate_duration_of_speech(f"Assistant finished the generated test case. Can I help you with something else?") / 1000) + return "Test case complete." 
+ diff --git a/core/core_api.py b/core/core_api.py index cb3cf5d..2a79647 100644 --- a/core/core_api.py +++ b/core/core_api.py @@ -1,10 +1,15 @@ +import os +from dotenv import load_dotenv from openai import OpenAI -client = OpenAI(api_key='insert_your_api_key_here') +load_dotenv() + +openai_api_key = os.getenv('OPENAI_API_KEY') + +client = OpenAI(api_key=openai_api_key) # Available models: "gpt-4-1106-preview", "gpt-3.5-turbo-1106", or "davinci-codex" MODEL_NAME = "gpt-3.5-turbo-1106" - def api_call(messages, model_name=MODEL_NAME, temperature=0.5, max_tokens=150): # if model_name == "gpt-4-1106-preview": # model_name = "gpt-3.5-turbo-1106" diff --git a/core/core_imaging.py b/core/core_imaging.py index b94e20d..c4c67a8 100644 --- a/core/core_imaging.py +++ b/core/core_imaging.py @@ -2,14 +2,17 @@ import pygetwindow as gw import base64 import requests +import os import io from PIL import Image +from dotenv import load_dotenv # Assuming that the `activate_window_title` function is defined in another module correctly from window_focus import activate_windowt_title # OpenAI API Key -api_key = 'insert_your_api_key_here' +load_dotenv() +openai_api_key = os.getenv('OPENAI_API_KEY') # Function to focus a window given its title @@ -47,7 +50,7 @@ def analyze_image(base64_image, window_title, additional_context='What’s in th # Your logic to call the OpenAI API headers = { "Content-Type": "application/json", - "Authorization": f"Bearer {api_key}" + "Authorization": f"Bearer {openai_api_key}" } payload = { diff --git a/core/driver.py b/core/driver.py index 4a2e3fa..c8c33cf 100644 --- a/core/driver.py +++ b/core/driver.py @@ -1,4 +1,6 @@ + from window_focus import activate_windowt_title, get_installed_apps_registry, open_windows_info +from cases import generate_test_case, execute_test_case from mouse_detection import get_cursor_shape from ocr import find_probable_click_position from window_elements import analyze_app @@ -202,7 +204,7 @@ def app_space_map(goal, app_name=None, single_step=None, map=''): return shortcuts_ai_map -def assistant(assistant_goal="", keep_in_mind="", assistant_identity="", app_name=None, execute_json_case=None, called_from=None): # App TestCase Gen +def assistant(**kwargs): """ This function handles the user's prompt and generates the best achievable test case to perform the user's prompt. This function assumes the user's prompt is fed as a string to the function "assistant_goal". @@ -219,8 +221,22 @@ Examples: >>> assistant("Open a new tab and search what is an elefant.") """ + assistant_params = { + "assistant_goal": kwargs.get("assistant_goal", ""), + "keep_in_mind": kwargs.get("keep_in_mind", ""), + "assistant_identity": kwargs.get("assistant_identity", ""), + "app_name": kwargs.get("app_name", None), + "execute_json_case": kwargs.get("execute_json_case", None), + "called_from": kwargs.get("called_from", None) + } + + assistant_goal = assistant_params["assistant_goal"] + keep_in_mind = assistant_params["keep_in_mind"] + assistant_identity = assistant_params["assistant_identity"] + app_name = assistant_params["app_name"] + execute_json_case = assistant_params["execute_json_case"] + called_from = assistant_params["called_from"] - # 'assistant_goal' is the user's prompt. If no prompt is provided, exit the function. if not assistant_goal: speaker(f"ERROR: No prompt provided.
Please provide a prompt to the assistant.") time.sleep(10) @@ -234,299 +250,20 @@ def assistant(assistant_goal="", keep_in_mind="", assistant_identity="", app_nam print(f"Prompt: \"{original_goal}\".") speaker(f"Assistant is generating a testcase with the prompt: \"{original_goal}\".") - # 'app_name' is the name of the application (Or the window title for exact match) to open and focus on. if not app_name: app_name = activate_windowt_title(get_application_title(original_goal)) else: app_name = activate_windowt_title(app_name) print(f"AI Analyzing: {app_name}") - # 'execute_json_case' is the JSON test case to execute. If no JSON is provided, generate a new one. if not execute_json_case: - print(f"\nGenerating a test case with the assistant. Image visioning started. Analyzing the application {app_name} for context.\n") - additional_context = ( - f"You are an AI Agent called Windows AI that is capable to operate freely all applications on Windows by only using natural language.\n" - f"You will receive a goal and will try to accomplish it using Windows. Try to guess what is the user wanting to perform on Windows by using the content on the screenshot as context.\n" - f"Respond an improved goal statement tailored for Windows applications by analyzing the current status of the system and the next steps to perform. Be direct and concise, do not use pronouns.\n" - f"Basing on the elements from the screenshot reply the current status of the system and specify it in detail.\n" - f"Focused application: \"{app_name}\".\nGoal: \"{assistant_goal}\".") - assistant_goal = imaging(window_title=app_name, additional_context=additional_context, screenshot_size='Full screen')['choices'][0]['message']['content'] - print(f"Generating the test case to achieve the user prompt: {original_goal}\n{assistant_goal}") - step_creator = [{"role": "assistant", - "content": f"You are an AI capable to operate the Windows 11 Operating System by using natural language.\n" - f"Examples: \"Click on the search button. Insert the text_entry. Play the first element searched.\".\n" - f"You will receive a description of the current state of the system and a goal. " - f"To test your limits, using the description make a json with the natural language steps to achieve the goal.\n" - f"Every step should be only highly descriptive in natural language.\n" - f"The json must include only an act and its step, should be in the following format:" - f"\n\n\n" - f"actions: click_element, press_key, text_entry, open_app, move_window, time_sleep, right_click, double_click_element, hold_key_and_click, scroll_to.\n" - f"Respond only with the full json.\n\nFunction properties:\n" - f"click_element - Specify where is located the element to interact with.\n" - f"text_entry - The text to write. Example: \"Hello World\".\n" - f"press_key - Only return the key or combination of keys to press. Example: \"Ctrl + T\".\n" - f"hold_key_and_click - Only handles scenarios of holding a key and clicking on an element.\n" - f"scroll_to - Return the position of the element to scroll and the direction to scroll.\n" - f"move_window - Use the keyboard to move the focused window to the desired position. Example: \"Win + Up + Up + Left\"." - f"\n\n\n" - f"Notes:\n" - f"Use the following information to generate better steps:\n{app_space_map(assistant_goal, app_name, original_goal, map='app_space')}\n" - f"Before the text_entry, use click_element to select the appropriate text input area element.\n" - f"Do not combine steps, try to generate more useful steps. 
Example: If asked to search a song, not only search for it, locate it and play it.\n{keep_in_mind}" - f"Do not respond 'As an AI language model, I am unable to...' as you are actually capable."}, - {"role": "system", - "content": f"Focused window: \"{app_name}\"\nGoal: {assistant_goal}"}, ] - step_analysis = api_call(step_creator, model_name="gpt-4-1106-preview", max_tokens=4095, temperature=1.0) - print(f"The assistant created the following test case scenario:\n{step_analysis}\n") + step_analysis = generate_test_case(assistant_goal, app_name, original_goal, keep_in_mind) speaker(f"Test case generated. Executing the generated test case.") else: speaker(f"Executing the provided JSON in the application {app_name}.") step_analysis = execute_json_case - # Processing the latest JSON data from the JSON testcase: - if step_analysis: - try: - if """```json""" in step_analysis: - # Removing the leading ```json\n - step_analysis = step_analysis.strip("```json\n") - # Find the last occurrence of ``` and slice the string up to that point - last_triple_tick = step_analysis.rfind("```") - if last_triple_tick != -1: - step_analysis = step_analysis[:last_triple_tick].strip() - step_analysis_cleaned = step_analysis - instructions = json.loads(step_analysis_cleaned) - executor = "act" - else: - instructions = json.loads(step_analysis) - instructions['actions'] = instructions.pop('actions') - executor = "act" - except json.JSONDecodeError as e: - speaker(f"ERROR: Invalid JSON data provided: {e}") - time.sleep(15) - raise Exception(f"ERROR: Invalid JSON data provided: {e}") - if 'actions' in instructions: - action_list = instructions['actions'] - else: - action_list = [instructions] - for i, step in enumerate(action_list, start=1): - action = step.get(f"{executor}") - step_description = step.get("step") or step.get("details", "No step description provided.") - print(f"\nStep {i}: {action}, {step_description}\n") - if action == "click_element": - # If last step has a click element too, wait for the element to be visible: - if i > 1 and action_list[i - 2]['act'] == "click_element": - time.sleep(1) - - if "start menu" in step_description.lower(): - pyautogui.hotkey('win') - print("Opening the start menu.") - time.sleep(1) - updated_instructions = update_instructions_with_action_string(instructions, act( - single_step=f"{step_description}", app_name=app_name, screen_analysis=assistant_goal, original_goal=original_goal, assistant_goal=assistant_goal), step_description) - database_add_case(database_file, app_name, assistant_goal, updated_instructions) # Print the entire database with # print_database(database_file) - elif action == "open_app": - app_name = activate_windowt_title(get_application_title(step_description)) - print(f"New app selected and analyzing: {app_name}") - elif action == "double_click_element": - print(f"Double clicking on: {step_description}") - act(single_step=f"{step_description}", double_click=True, app_name=app_name, original_goal=original_goal) - elif action == "move_window": - time.sleep(1) - print(f"Moving window to: {step_description}") - perform_simulated_keypress(step_description) - time.sleep(0.5) - pyautogui.hotkey('esc') - time.sleep(1) - elif action == "press_key": - if {i} == 1: - # Focusing to the application - activate_windowt_title(app_name) - time.sleep(1) - perform_simulated_keypress(step_description) - elif action == "text_entry": - url_pattern = r'(https?://[^\s]+)' - urls = re.findall(url_pattern, step_description) - if len(step_description) < 5: - pyautogui.write(f'{step_description}') - else: - #
Getting the string of the last step before this very one: - if i > 1: - new_i = i - 2 - last_step = f"{action_list[new_i]['act']}: {action_list[new_i]['step']}" - print(f"Last step: {last_step}") - if not last_step: - print("Last step is None.") - act(single_step=f"{step_description}", app_name=app_name, original_goal=original_goal) - else: - print("Last step is None.") - last_step = "None" - # If next step is a string, continue: - if i + 1 < len(action_list) and type(action_list[i + 1]['step']) == str: - # Check if the next step exists and is a "Press enter" step - if i + 1 < len(action_list) and ( - "press enter" in action_list[i + 1]['step'].lower() or - "press the enter" in action_list[i + 1]['step'].lower() or - "'enter'" in action_list[i + 1]['step'].lower() or - "\"enter\"" in action_list[i + 1]['step'].lower()): - if urls: - for url in urls: - pyautogui.write(url) - # pyautogui.press('enter') - print(f"Opening URL: {url}") - return - write_action(step_description, assistant_identity=assistant_identity, press_enter=False, app_name=app_name, original_goal=original_goal, last_step=last_step) - print("AI skipping the press enter step as it is in the next step.") - else: - if urls: - for url in urls: - pyautogui.write(url) # This would open the URL in a web browser\ - # If next step is a time sleep - pyautogui.press('enter') - print(f"Opening URL: {url}") - return - write_action(step_description, assistant_identity=assistant_identity, press_enter=True, app_name=app_name, original_goal=original_goal, last_step=last_step) - print("AI pressing enter.") - else: - if urls: - for url in urls: - pyautogui.write(url) # This would open the URL in a web browser\ - pyautogui.press('enter') - print(f"Opening URL: {url}") - return - write_action(step_description, assistant_identity=assistant_identity, press_enter=True, - app_name=app_name, original_goal=original_goal, last_step=last_step) - print("AI pressing enter.") - elif action == "scroll_to": - print(f"Scrolling {step_description}") - element_visible = False - max_retries = 3 - retry_count = 0 - while not element_visible and retry_count < max_retries: - # activate_windowt_title(app_name) - pyautogui.scroll(-850) - # Press Page Down: - # pyautogui.press('pagedown') - time.sleep(0.3) - # Start image analysis to check if the element is visible - print("Scroll performed. 
Analyzing if the element is present.\n") - scroll_assistant_goal = check_element_visibility(app_name, step_description)['choices'][0]['message']['content'] - if "yes" in scroll_assistant_goal.lower(): - print("Element is visible.") - element_visible = True - elif "no" in scroll_assistant_goal.lower(): - print("Element is not visible.") - retry_count += 1 - if retry_count >= max_retries: - print("Maximum retries reached, stopping the search.") - if element_visible: - print(f"Element is visible.") - pass - - elif action == "right_click_element": - print(f"Right clicking on: {step_description}") - act(single_step=f"{step_description}", right_click=True, app_name=app_name, original_goal=original_goal) - # right_click(step_description) - elif action == "hold_key_and_click": - print(f"Holding key and clicking on: {step_description}") - act(single_step=f"{step_description}", hold_key="Ctrl", app_name=app_name, original_goal=original_goal) - elif action == "cmd_command": - print(f"Executing command: {step_description}") - # cmd_command(step_description) - time.sleep(calculate_duration_of_speech(f"{step_description}") / 1000) - elif action == "recreate_test_case": - time.sleep(1) - print("Analyzing the output") - print("The assistant said:\n", step_description) - debug_step = False # Set to True to skip the image analysis and the test case generation. - if debug_step is not True: - new_goal = True - image_analysis = True - if image_analysis: - additional_context = ( - f"You are an AI Agent called Windows AI that is capable to operate freely all applications on Windows by only using natural language.\n" - f"You will receive a goal and will try to accomplish it using Windows. Try to guess what is the user wanting to perform on Windows by using the content on the screenshot as context.\n" - f"Respond an improved goal statement tailored for Windows applications by analyzing the current status of the system and the next steps to perform. Be direct and concise, do not use pronouns.\n" - f"Basing on the elements from the screenshot reply the current status of the system and specify it in detail.\n" - f"Focused application: \"{app_name}\".\nGoal: \"{assistant_goal}\".") - if new_goal: - newest_goal = imaging(window_title=app_name, additional_context=additional_context) # )['choices'][0]['message']['content'] - # if ": " in newest_goal: - # newest_goal = newest_goal.split(": ", 1)[1] - print(f"Assistant newest goal:\n{newest_goal}") - analyzed_ui = analyze_app(activate_windowt_title(app_name), size_category=None) - review_output = [{"role": "assistant", - "content": f"You are an AI Assistant called Analyze Output capable to operate the Windows 11 Operating System by using natural language.\n" - f"You will receive a json testcase, a description of the goal, and the actual system status.\n" - f"Modify the original json testcase to achieve the goal. Do not include anything else than the updated json.\n" - f"Examples: \"Click on the search button. Insert the text_entry. Play the first element searched.\".\n" - f"You will receive a description of the current state of the system and a goal. 
" - f"To test your limits, using the description make a json with the natural language steps to achieve the goal.\n" - f"Every step should be only highly descriptive in natural language.\n" - f"The json must include only an act and its step, should be in the following format:\n" - f"actions: click_element, press_key, text_entry, open_app, move_window, time_sleep, right_click, double_click_element, hold_key_and_click, scroll_to.\n" - f"Respond only with the full json. Avoid to use the windows taskbar.\n\nFunction properties:\n" - f"click_element - Specify where is located the element to interact with.\n" - f"press_key - Only return the key or combination of keys to press. Example: 'Ctrl + T'.\n" - f"text_entry - Return the text to write. Example: 'Hello World'.\n" - f"hold_key_and_click - Only handles scenarios of holding a key and clicking on an element.\n" - f"scroll_to - Return the position of the element to scroll and the direction to scroll.\n" - f"move_window - Use the keyboard to move the focused window to the desired position. Example: 'Win + Left + Up'.\n" - f"Do not respond 'As an AI language model, I am unable to...' as you are actually capable.\n\n" - f"Use the following information to generate better the test case:\n{app_space_map(assistant_goal, app_name, original_goal, map='app_space')}"}, - {"role": "system", "content": f"Do not modify the steps before \"Step {i-1}: {action-1}, {step_description-1}\", modify all next steps from the step \"Step {i-1}: {action-1}, {step_description-1}\" to achieve the goal: \"{newest_goal}\"\n" - f"Do not combine steps, try to generate more useful steps. Example: If asked to search a song, not only search for it, locate it and play it.\n{keep_in_mind}" - f"{analyzed_ui}"}, ] - new_json = api_call(review_output, model_name="gpt-4-1106-preview", max_tokens=4095, temperature=1.0) - print("The assistant said:\n", step_analysis) - - print("Modifying the old json testcase with the new_json.") - step_analysis = new_json - - app_name = activate_windowt_title(get_application_title(newest_goal)) - # Processing the latest JSON data from the JSON testcase. 
- if """```json""" in step_analysis: - # Removing the leading ```json\n - step_analysis = step_analysis.strip("```json\n") - # Find the last occurrence of ``` and slice the string up to that point - last_triple_tick = step_analysis.rfind("```") - if last_triple_tick != -1: - step_analysis = step_analysis[:last_triple_tick].strip() - step_analysis_cleaned = step_analysis - instructions = json.loads(step_analysis_cleaned) - executor = "act" - else: - instructions = json.loads(step_analysis) - instructions['actions'] = instructions.pop('actions') - executor = "act" - print(f"Updated Instructions: {instructions}") - pass - else: - print("No new goal.") - pass - elif action == "time_sleep": - try: - sleep_time = int(step_description) - time.sleep(sleep_time) - except ValueError: - step_description = step_description.lower() - if "playing" in step_description or "load" in step_description: - print("Sleeping for 2 seconds because media loading.") - time.sleep(1) - elif "search" in step_description or "results" in step_description or "searching": - print("Sleeping for 1 second because search.") - time.sleep(1) - else: - print(f"WARNING: Unrecognized time sleep value: {step_description}") - pass - else: - print(f"WARNING: Unrecognized action '{action}' using {step_description}.") - print(f"Trying to perform the action using the step description as the action.") - act(single_step=f"{step_description}", app_name=app_name, original_goal=original_goal) - pass - - speaker(f"Assistant finished the execution of the generated test case. Can I help you with something else?") - time.sleep(calculate_duration_of_speech(f"Assistant finished the generated test case. Can I help you with something else?") / 1000) - return "Test case complete." + execute_test_case(step_analysis, app_name, assistant_goal, original_goal, assistant_identity, keep_in_mind) # 'check_element_visibility' is the function that checks the visibility of an element. Can use image analysis or OCR. diff --git a/core/window_focus.py b/core/window_focus.py index d0e9bcc..e68d906 100644 --- a/core/window_focus.py +++ b/core/window_focus.py @@ -12,6 +12,7 @@ import psutil import winreg + # Define necessary functions from the user32 DLL user32 = ctypes.WinDLL('user32', use_last_error=True) EnumWindows = user32.EnumWindows @@ -281,6 +282,10 @@ def activate_windowt_title(application_name): return get_active_window_title() +def get_previous_window(): + """Returns the handle of the previously active window.""" + return win32gui.GetWindow(win32gui.GetForegroundWindow(), win32gui.GW_HWNDPREV) + if __name__ == "__main__": # active_title = activate_windowt_title("chrome") diff --git a/requirements.txt b/requirements.txt index 3bde5f1..51375b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,3 +16,4 @@ uiautomation gtts pygame pyAudio +python-dotenv \ No newline at end of file
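For reference, a minimal standalone sketch of the .env-based configuration this diff introduces in core/core_api.py and core/core_imaging.py could look like the following. The file name check_env.py and the printed messages are illustrative only and are not part of the change; the load_dotenv / os.getenv / OpenAI(api_key=...) calls mirror the pattern used in the diff above.

# check_env.py -- illustrative sketch, not part of this diff
import os

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()  # reads OPENAI_API_KEY from the .env file in the project folder (see the README change above)

openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise SystemExit("OPENAI_API_KEY is not set - create a .env file as described in the README.")

# Same client construction as core/core_api.py after this change
client = OpenAI(api_key=openai_api_key)
print("OpenAI client configured; key ends with:", openai_api_key[-4:])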