# soft-analytics-01/tests/test_cleaner_clean.py

import io
import json
import os
import tarfile

import pandas as pd
import pytest

from src.cleaner.clean import clean_all, save_set


@pytest.fixture
def sample_tar_file(tmp_path):
    """Build a gzip-compressed tar archive that holds one JSON issue.

    The archive is written as ``sample_issues.tar.gz`` inside ``tmp_path``
    and contains a single member named ``sample_issue.json``.

    Returns:
        The path of the archive that was created.
    """
    # The same user appears both as the single assignee and in the
    # assignees list, so define it once.
    assigned_user = {
        "login": "sample_user",
        "id": 3,
    }
    issue = {
        "id": 10001,
        "node_id": "giovanni",
        "number": 1,
        "title": "Sample Issue",
        "user": {
            "login": "test_user",
            "id": 2,
        },
        "labels": [],
        "state": "open",
        "assignee": assigned_user,
        "assignees": [assigned_user],
        "created_at": "2022-01-01T00:00:00Z",
        "body": "This is a sample issue body.",
    }
    payload: bytes = json.dumps(issue).encode()

    # A TarInfo built by hand needs its size set explicitly, otherwise the
    # member would be stored as empty.
    member = tarfile.TarInfo('sample_issue.json')
    member.size = len(payload)

    archive_path = os.path.join(tmp_path, "sample_issues.tar.gz")
    with tarfile.open(archive_path, 'w:gz') as archive:
        archive.addfile(member, fileobj=io.BytesIO(payload))

    return archive_path


def test_clean_all(sample_tar_file):
    """Run ``clean_all`` on the one-issue archive and check the cleaned record.

    ``clean_all`` is given an empty list to fill and the archive path; its
    return value is expected to be zero (no issues skipped).
    """
    cleaned = []
    skipped = clean_all(cleaned, sample_tar_file)
    assert skipped == 0  # No issues should be skipped

    # The archive holds a single issue, so exactly one record must come out.
    assert len(cleaned) == 1
    record = cleaned[0]

    # NOTE(review): the fixture issue has id=10001 and number=1; expecting
    # 'id' == 1 presumably means clean_all keys records by issue number.
    # Confirm against clean_all.
    expected_fields = {
        'id': 1,
        'title': 'Sample Issue',
        'body': 'This is a sample issue body.',
        'state': 'open',
        'assignee': 'sample_user',
        'created_at': '2022-01-01T00:00:00Z',
    }
    for field, expected in expected_fields.items():
        assert record[field] == expected


def test_save_set(tmp_path):
    """Round-trip a small DataFrame through ``save_set`` and ``pd.read_csv``.

    The frame is saved under a ``test_file_`` prefix inside ``tmp_path``,
    then read back from ``test_file_test_000001_000003.csv`` and compared
    with the original.
    """
    columns = {
        'title': ['Issue 1', 'Issue 2', 'Issue 3'],
        'body': ['Body 1', 'Body 2', 'Body 3'],
        'state': ['open', 'closed', 'open'],
        'assignee': ['user1', 'user2', 'user3'],
        'created_at': ['2022-01-01T00:00:00Z', '2022-01-02T00:00:00Z', '2022-01-03T00:00:00Z']
    }
    expected = pd.DataFrame(columns, index=[1, 2, 3])

    # Write the frame out through save_set using a prefix inside tmp_path.
    prefix = os.path.join(tmp_path, 'test_file_')
    save_set(expected, 1, 3, 'test', prefix)

    # Read the CSV back, restoring the first column as the index.
    csv_path = os.path.join(tmp_path, 'test_file_test_000001_000003.csv')
    reloaded = pd.read_csv(csv_path, index_col=0)

    assert reloaded.equals(expected)