"""Tests for ``clean_all`` and ``save_set`` from ``src.cleaner.clean``."""

import io
import json
import os
import tarfile

import pandas as pd
import pytest

from src.cleaner.clean import clean_all, save_set


@pytest.fixture
def sample_tar_file(tmp_path):
    """Build a gzip-compressed tar archive holding one JSON-encoded issue.

    The archive is written to ``sample_issues.tar.gz`` inside pytest's
    ``tmp_path`` and contains a single member, ``sample_issue.json``.

    Returns:
        The path of the archive, as a string.
    """
    issue = {
        "id": 10001,
        "node_id": "giovanni",
        "number": 1,
        "title": "Sample Issue",
        "user": {
            "login": "test_user",
            "id": 2,
        },
        "labels": [],
        "state": "open",
        "assignee": {
            "login": "sample_user",
            "id": 3,
        },
        "assignees": [
            {
                "login": "sample_user",
                "id": 3,
            }
        ],
        "created_at": "2022-01-01T00:00:00Z",
        "body": "This is a sample issue body.",
    }
    payload: bytes = json.dumps(issue).encode()

    archive_path = os.path.join(tmp_path, "sample_issues.tar.gz")
    with tarfile.open(archive_path, 'w:gz') as archive:
        member = tarfile.TarInfo('sample_issue.json')
        # addfile() copies exactly ``member.size`` bytes from the file object,
        # so the size has to be set before the member is added.
        member.size = len(payload)
        archive.addfile(member, fileobj=io.BytesIO(payload))

    return archive_path


def test_clean_all(sample_tar_file):
    """clean_all should skip nothing and append one flattened record."""
    cleaned = []
    skipped = clean_all(cleaned, sample_tar_file)

    # No issues should be skipped.
    assert skipped == 0
    assert len(cleaned) == 1

    # NOTE(review): the raw issue has "id": 10001 and "number": 1, yet the
    # cleaned record is expected to have id == 1 -- presumably clean_all keys
    # records by issue number; confirm against clean_all.
    expected_fields = {
        'id': 1,
        'title': 'Sample Issue',
        'body': 'This is a sample issue body.',
        'state': 'open',
        'assignee': 'sample_user',
        'created_at': '2022-01-01T00:00:00Z',
    }
    record = cleaned[0]
    for field, expected in expected_fields.items():
        assert record[field] == expected


def test_save_set(tmp_path):
    """save_set should write a CSV that reads back equal to the input frame."""
    columns = {
        'title': ['Issue 1', 'Issue 2', 'Issue 3'],
        'body': ['Body 1', 'Body 2', 'Body 3'],
        'state': ['open', 'closed', 'open'],
        'assignee': ['user1', 'user2', 'user3'],
        'created_at': [
            '2022-01-01T00:00:00Z',
            '2022-01-02T00:00:00Z',
            '2022-01-03T00:00:00Z',
        ],
    }
    frame = pd.DataFrame(columns, index=[1, 2, 3])

    # The prefix is built with os.path.join so save_set receives a plain str.
    prefix = os.path.join(tmp_path, 'test_file_')
    save_set(frame, 1, 3, 'test', prefix)

    # The file name checked here is the one this test expects save_set to
    # produce for the arguments above.
    csv_path = os.path.join(tmp_path, 'test_file_test_000001_000003.csv')
    reloaded = pd.read_csv(csv_path, index_col=0)
    assert reloaded.equals(frame)