soft-analytics-01/tests/test_cleaner_clean.py

import io
import json
import os
import tarfile

import pandas as pd
import pytest

from src.cleaner.clean import clean_all, save_set


@pytest.fixture
def sample_tar_file(tmp_path):
    """Write a gzipped tar archive with one JSON issue and return its path.

    The archive is created as ``sample_issues.tar.gz`` inside ``tmp_path`` and
    holds a single member named ``sample_issue.json`` whose content is the
    JSON-encoded sample issue built below.
    """
    reporter = {"login": "test_user", "id": 2}
    owner = {"login": "sample_user", "id": 3}

    # Key order is kept stable so the serialized payload is deterministic.
    issue_payload = {
        "id": 10001,
        "node_id": "giovanni",
        "number": 1,
        "title": "Sample Issue",
        "user": reporter,
        "labels": [],
        "state": "open",
        "assignee": owner,
        "assignees": [dict(owner)],
        "created_at": "2022-01-01T00:00:00Z",
        "body": "This is a sample issue body.",
    }
    raw_json: bytes = json.dumps(issue_payload).encode()

    # The member header must carry the payload size, otherwise addfile()
    # would store an empty file.
    member = tarfile.TarInfo(name='sample_issue.json')
    member.size = len(raw_json)

    archive_path = os.path.join(tmp_path, "sample_issues.tar.gz")
    with tarfile.open(archive_path, mode='w:gz') as archive:
        archive.addfile(member, fileobj=io.BytesIO(raw_json))

    return archive_path


def test_clean_all(sample_tar_file):
    """Run clean_all on the sample archive and check the single collected record."""
    collected = []
    skipped = clean_all(collected, sample_tar_file)

    # No issues should be skipped
    assert skipped == 0

    # The sample archive contains one issue, so one record is expected.
    assert len(collected) == 1

    # NOTE(review): the expected 'id' is 1, which equals the fixture's
    # "number" field rather than its "id" (10001) -- presumably clean_all
    # re-keys the record; confirm against the implementation.
    expected_fields = (
        ('id', 1),
        ('title', 'Sample Issue'),
        ('body', 'This is a sample issue body.'),
        ('state', 'open'),
        ('assignee', 'sample_user'),
        ('created_at', '2022-01-01T00:00:00Z'),
    )
    for field_name, expected_value in expected_fields:
        assert collected[0][field_name] == expected_value


def test_save_set(tmp_path):
    """Save a three-row frame with save_set and check the CSV reads back equal."""
    sample_columns = {
        'title': ['Issue 1', 'Issue 2', 'Issue 3'],
        'body': ['Body 1', 'Body 2', 'Body 3'],
        'state': ['open', 'closed', 'open'],
        'assignee': ['user1', 'user2', 'user3'],
        'created_at': [
            '2022-01-01T00:00:00Z',
            '2022-01-02T00:00:00Z',
            '2022-01-03T00:00:00Z',
        ],
    }
    expected = pd.DataFrame(sample_columns, index=[1, 2, 3])

    # save_set receives a path prefix; this test expects it to produce
    # '<prefix>test_000001_000003.csv' for the arguments (1, 3, 'test').
    output_prefix = os.path.join(tmp_path, 'test_file_')
    save_set(expected, 1, 3, 'test', output_prefix)

    # Read the file back, using the first CSV column as the index.
    written_csv = os.path.join(tmp_path, 'test_file_test_000001_000003.csv')
    reloaded = pd.read_csv(written_csv, index_col=0)

    assert reloaded.equals(expected)
Final version of the bug-triaging project. Commit history has been discarded to remove large files from the repo. 2024-01-03 14:20:45 +00:00			`import io`
			`import json`
			`import os`
			`import tarfile`

			`import pandas as pd`
			`import pytest`

			`from src.cleaner.clean import clean_all, save_set`


			`@pytest.fixture`
			`def sample_tar_file(tmp_path):`
			`# Create a sample tar file for testing`
			`tar_file_path = os.path.join(tmp_path, "sample_issues.tar.gz")`
			`with tarfile.open(tar_file_path, 'w:gz') as tar:`
			`# Add a sample JSON file to the tar archive`
			`sample_issue = {`
			`"id": 10001,`
			`"node_id": "giovanni",`
			`"number": 1,`
			`"title": "Sample Issue",`
			`"user": {`
			`"login": "test_user",`
			`"id": 2,`
			`},`
			`"labels": [],`
			`"state": "open",`
			`"assignee": {`
			`"login": "sample_user",`
			`"id": 3,`
			`},`
			`"assignees": [`
			`{`
			`"login": "sample_user",`
			`"id": 3,`
			`}`
			`],`
			`"created_at": "2022-01-01T00:00:00Z",`
			`"body": "This is a sample issue body.",`
			`}`
			`tarinfo = tarfile.TarInfo('sample_issue.json')`
			`contents: bytes = json.dumps(sample_issue).encode()`
			`tarinfo.size = len(contents)`

			`file_object = io.BytesIO(contents)`
			`tar.addfile(tarinfo, fileobj=file_object)`

			`return tar_file_path`


			`def test_clean_all(sample_tar_file):`
			`objs = []`
			`counter = clean_all(objs, sample_tar_file)`
			`assert counter == 0 # No issues should be skipped`

			`# Assuming you have some assertions for the content of objs based on the sample data`
			`assert len(objs) == 1`
			`assert objs[0]['id'] == 1`
			`assert objs[0]['title'] == 'Sample Issue'`
			`assert objs[0]['body'] == 'This is a sample issue body.'`
			`assert objs[0]['state'] == 'open'`
			`assert objs[0]['assignee'] == 'sample_user'`
			`assert objs[0]['created_at'] == '2022-01-01T00:00:00Z'`


			`def test_save_set(tmp_path):`
			`# Assuming you have a DataFrame (df) with some sample data`
			`df = pd.DataFrame({`
			`'title': ['Issue 1', 'Issue 2', 'Issue 3'],`
			`'body': ['Body 1', 'Body 2', 'Body 3'],`
			`'state': ['open', 'closed', 'open'],`
			`'assignee': ['user1', 'user2', 'user3'],`
			`'created_at': ['2022-01-01T00:00:00Z', '2022-01-02T00:00:00Z', '2022-01-03T00:00:00Z']`
			`}, index=[1, 2, 3])`

			`# Save the DataFrame to a CSV file using save_set`
			`save_set(df, 1, 3, 'test', os.path.join(tmp_path, 'test_file_'))`

			`# Load the saved CSV file and assert its content`
			`loaded_df = pd.read_csv(os.path.join(tmp_path, 'test_file_test_000001_000003.csv'), index_col=0)`

			`assert loaded_df.equals(df)`