Skip to content

Commit

Permalink
EES-5017 Add data set GET query endpoint functionality
Browse files Browse the repository at this point in the history
  • Loading branch information
ntsim committed Apr 3, 2024
1 parent 5ad3eca commit ec53169
Show file tree
Hide file tree
Showing 30 changed files with 2,153 additions and 5 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
using System.Collections.Generic;

namespace GovUk.Education.ExploreEducationStatistics.Common.Validators.ErrorDetails;

/// <summary>
/// Error detail carrying the collection of items that could not be found.
/// </summary>
/// <param name="Items">The items that could not be found.</param>
/// <typeparam name="T">The type of each item.</typeparam>
public record NotFoundItemsErrorDetail<T>(IEnumerable<T> Items);
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
using Asp.Versioning;
using GovUk.Education.ExploreEducationStatistics.Common.Extensions;
using GovUk.Education.ExploreEducationStatistics.Common.ModelBinding;
using GovUk.Education.ExploreEducationStatistics.Public.Data.Api.Requests;
using GovUk.Education.ExploreEducationStatistics.Public.Data.Api.Services.Interfaces;
using GovUk.Education.ExploreEducationStatistics.Public.Data.Api.ViewModels;
Expand All @@ -12,7 +11,10 @@ namespace GovUk.Education.ExploreEducationStatistics.Public.Data.Api.Controllers
[ApiVersion(1.0)]
[ApiController]
[Route("api/v{version:apiVersion}/data-sets")]
public class DataSetsController(IDataSetService dataSetService) : ControllerBase
public class DataSetsController(
IDataSetService dataSetService,
IDataSetQueryService dataSetQueryService)
: ControllerBase
{
/// <summary>
/// Get a data set’s summary
Expand Down Expand Up @@ -261,6 +263,8 @@ public async Task<ActionResult<DataSetQueryPaginatedResultsViewModel>> QueryData
[FromQuery] DataSetGetQueryRequest request,
CancellationToken cancellationToken)
{
return Ok();
return await dataSetQueryService
.Query(dataSetId, request, dataSetVersion, cancellationToken)
.HandleFailuresOrOk();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
namespace GovUk.Education.ExploreEducationStatistics.Public.Data.Api.Model;

/// <summary>
/// Pairs an integer <see cref="Id"/> with a string <see cref="PublicId"/>.
/// Both properties are required and can only be set during initialisation.
/// </summary>
public record IdPublicIdPair
{
/// <summary>
/// The integer identifier.
/// </summary>
public required int Id { get; init; }

/// <summary>
/// The string public identifier.
/// </summary>
public required string PublicId { get; init; }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
using GovUk.Education.ExploreEducationStatistics.Common.Model;

namespace GovUk.Education.ExploreEducationStatistics.Public.Data.Api.Model;

/// <summary>
/// A single sort criterion made up of a field and a direction.
/// </summary>
/// <param name="Field">The field to sort by.</param>
/// <param name="Direction">The direction to sort in.</param>
public record Sort(string Field, SortDirection Direction);
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
using GovUk.Education.ExploreEducationStatistics.Common.Model.Data;
using GovUk.Education.ExploreEducationStatistics.Public.Data.Api.Model;
using GovUk.Education.ExploreEducationStatistics.Public.Data.Model;
using InterpolatedSql;

namespace GovUk.Education.ExploreEducationStatistics.Public.Data.Api.Repository.Interfaces;

/// <summary>
/// Read access to the Parquet data of a data set version: counting and paging
/// through data rows, and listing the columns, location levels, filter columns
/// and indicator columns that are available.
/// </summary>
public interface IParquetDataRepository
{
/// <summary>
/// Counts the data rows of a data set version that match a condition.
/// </summary>
/// <param name="dataSetVersion">The data set version whose data is counted.</param>
/// <param name="where">The SQL condition to filter rows by.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>The number of matching rows.</returns>
Task<long> CountRows(
DataSetVersion dataSetVersion,
IInterpolatedSql where,
CancellationToken cancellationToken = default);

/// <summary>
/// Lists a page of data rows of a data set version that match a condition.
/// </summary>
/// <param name="dataSetVersion">The data set version whose data is listed.</param>
/// <param name="columns">The columns to include in each returned row.</param>
/// <param name="where">The SQL condition to filter rows by.</param>
/// <param name="sorts">The sort criteria to order rows by. Optional.</param>
/// <param name="page">The page of results to return. Defaults to 1.</param>
/// <param name="pageSize">The number of rows per page. Defaults to 1000.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>The rows of the page, each as a dictionary of values keyed by string.</returns>
Task<IEnumerable<IDictionary<string, object?>>> ListRows(
DataSetVersion dataSetVersion,
IEnumerable<string> columns,
IInterpolatedSql where,
IEnumerable<Sort>? sorts = null,
int page = 1,
int pageSize = 1000,
CancellationToken cancellationToken = default);

/// <summary>
/// Lists the columns of a data set version's data.
/// </summary>
/// <param name="dataSetVersion">The data set version to inspect.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>The set of column names.</returns>
Task<ISet<string>> ListColumns(
DataSetVersion dataSetVersion,
CancellationToken cancellationToken = default);

/// <summary>
/// Lists the geographic levels of a data set version's locations.
/// </summary>
/// <param name="dataSetVersion">The data set version to inspect.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>The set of geographic levels.</returns>
Task<ISet<GeographicLevel>> ListLocationLevels(
DataSetVersion dataSetVersion,
CancellationToken cancellationToken = default);

/// <summary>
/// Lists the filter columns of a data set version.
/// </summary>
/// <param name="dataSetVersion">The data set version to inspect.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>The set of filter column names.</returns>
Task<ISet<string>> ListFilterColumns(
DataSetVersion dataSetVersion,
CancellationToken cancellationToken = default);

/// <summary>
/// Lists the indicator columns of a data set version.
/// </summary>
/// <param name="dataSetVersion">The data set version to inspect.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>The set of indicator column names.</returns>
Task<ISet<string>> ListIndicatorColumns(
DataSetVersion dataSetVersion,
CancellationToken cancellationToken = default);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
using GovUk.Education.ExploreEducationStatistics.Public.Data.Api.Model;
using GovUk.Education.ExploreEducationStatistics.Public.Data.Model;
using GovUk.Education.ExploreEducationStatistics.Public.Data.Model.Parquet;

namespace GovUk.Education.ExploreEducationStatistics.Public.Data.Api.Repository.Interfaces;

/// <summary>
/// Read access to the <see cref="ParquetFilterOption"/> entries of a data set version.
/// </summary>
/// <remarks>
/// NOTE(review): no implementation is visible alongside this interface; the
/// descriptions below are inferred from the signatures and should be confirmed.
/// </remarks>
public interface IParquetFilterOptionRepository
{
/// <summary>
/// Lists filter options selected by integer id (presumably those whose id is in
/// <paramref name="ids"/> - confirm against the implementation).
/// </summary>
/// <param name="dataSetVersion">The data set version to read from.</param>
/// <param name="ids">The integer ids to look up.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
Task<IEnumerable<ParquetFilterOption>> List(
DataSetVersion dataSetVersion,
IEnumerable<int> ids,
CancellationToken cancellationToken = default);

/// <summary>
/// Lists filter options selected by public id (presumably those whose public id is in
/// <paramref name="publicIds"/> - confirm against the implementation).
/// </summary>
/// <param name="dataSetVersion">The data set version to read from.</param>
/// <param name="publicIds">The string public ids to look up.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
Task<IEnumerable<ParquetFilterOption>> List(
DataSetVersion dataSetVersion,
IEnumerable<string> publicIds,
CancellationToken cancellationToken = default);

/// <summary>
/// Lists <see cref="IdPublicIdPair"/> entries for the given integer ids
/// (presumably mapping each id to its public id - confirm against the implementation).
/// </summary>
/// <param name="dataSetVersion">The data set version to read from.</param>
/// <param name="ids">The integer ids to look up.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
Task<IEnumerable<IdPublicIdPair>> ListPublicIds(
DataSetVersion dataSetVersion,
IEnumerable<int> ids,
CancellationToken cancellationToken = default);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
using GovUk.Education.ExploreEducationStatistics.Public.Data.Api.Model;
using GovUk.Education.ExploreEducationStatistics.Public.Data.Api.Requests;
using GovUk.Education.ExploreEducationStatistics.Public.Data.Model;
using GovUk.Education.ExploreEducationStatistics.Public.Data.Model.Parquet;

namespace GovUk.Education.ExploreEducationStatistics.Public.Data.Api.Repository.Interfaces;

/// <summary>
/// Read access to the <see cref="ParquetLocationOption"/> entries of a data set version.
/// </summary>
/// <remarks>
/// NOTE(review): no implementation is visible alongside this interface; the
/// descriptions below are inferred from the signatures and should be confirmed.
/// </remarks>
public interface IParquetLocationOptionRepository
{
/// <summary>
/// Lists location options selected by integer id (presumably those whose id is in
/// <paramref name="ids"/> - confirm against the implementation).
/// </summary>
/// <param name="dataSetVersion">The data set version to read from.</param>
/// <param name="ids">The integer ids to look up.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
Task<IEnumerable<ParquetLocationOption>> List(
DataSetVersion dataSetVersion,
IEnumerable<int> ids,
CancellationToken cancellationToken = default);

/// <summary>
/// Lists location options selected by query locations (presumably those matching any of
/// <paramref name="locations"/> - confirm against the implementation).
/// </summary>
/// <param name="dataSetVersion">The data set version to read from.</param>
/// <param name="locations">The query locations to look up.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
Task<IEnumerable<ParquetLocationOption>> List(
DataSetVersion dataSetVersion,
IEnumerable<DataSetQueryLocation> locations,
CancellationToken cancellationToken = default);

/// <summary>
/// Lists <see cref="IdPublicIdPair"/> entries for the given integer ids
/// (presumably mapping each id to its public id - confirm against the implementation).
/// </summary>
/// <param name="dataSetVersion">The data set version to read from.</param>
/// <param name="ids">The integer ids to look up.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
Task<IEnumerable<IdPublicIdPair>> ListPublicIds(
DataSetVersion dataSetVersion,
IEnumerable<int> ids,
CancellationToken cancellationToken = default);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
using GovUk.Education.ExploreEducationStatistics.Public.Data.Api.Requests;
using GovUk.Education.ExploreEducationStatistics.Public.Data.Model;
using GovUk.Education.ExploreEducationStatistics.Public.Data.Model.Parquet;

namespace GovUk.Education.ExploreEducationStatistics.Public.Data.Api.Repository.Interfaces;

/// <summary>
/// Read access to the <see cref="ParquetTimePeriod"/> entries of a data set version.
/// </summary>
/// <remarks>
/// NOTE(review): no implementation is visible alongside this interface; the
/// descriptions below are inferred from the signatures and should be confirmed.
/// </remarks>
public interface IParquetTimePeriodRepository
{
/// <summary>
/// Lists time periods selected by integer id (presumably those whose id is in
/// <paramref name="ids"/> - confirm against the implementation).
/// </summary>
/// <param name="dataSetVersion">The data set version to read from.</param>
/// <param name="ids">The integer ids to look up.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
Task<IEnumerable<ParquetTimePeriod>> List(
DataSetVersion dataSetVersion,
IEnumerable<int> ids,
CancellationToken cancellationToken = default);

/// <summary>
/// Lists time periods selected by query time periods (presumably those matching any of
/// <paramref name="timePeriods"/> - confirm against the implementation).
/// </summary>
/// <param name="dataSetVersion">The data set version to read from.</param>
/// <param name="timePeriods">The query time periods to look up.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
Task<IEnumerable<ParquetTimePeriod>> List(
DataSetVersion dataSetVersion,
IEnumerable<DataSetQueryTimePeriod> timePeriods,
CancellationToken cancellationToken = default);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
using GovUk.Education.ExploreEducationStatistics.Common.Extensions;
using GovUk.Education.ExploreEducationStatistics.Common.Model.Data;
using GovUk.Education.ExploreEducationStatistics.Common.Utils;
using GovUk.Education.ExploreEducationStatistics.Public.Data.Api.Model;
using GovUk.Education.ExploreEducationStatistics.Public.Data.Api.Repository.Interfaces;
using GovUk.Education.ExploreEducationStatistics.Public.Data.Model;
using GovUk.Education.ExploreEducationStatistics.Public.Data.Model.DuckDb;
using GovUk.Education.ExploreEducationStatistics.Public.Data.Model.Parquet;
using GovUk.Education.ExploreEducationStatistics.Public.Data.Model.Parquet.Tables;
using GovUk.Education.ExploreEducationStatistics.Public.Data.Services.Interfaces;
using InterpolatedSql;
using InterpolatedSql.Dapper;
using StackExchange.Profiling;
using DataTable = GovUk.Education.ExploreEducationStatistics.Public.Data.Model.Parquet.Tables.DataTable;

namespace GovUk.Education.ExploreEducationStatistics.Public.Data.Api.Repository;

/// <summary>
/// Runs SQL over the Parquet files of a data set version using the injected
/// DuckDB connection. File locations come from the injected path resolver.
/// </summary>
public class ParquetDataRepository(
    IDuckDbConnection duckDbConnection,
    IParquetPathResolver parquetPathResolver)
    : IParquetDataRepository
{
    // Name of the CTE in ListRows that holds the ids of the rows on the requested page.
    private const string DataIdsAlias = "data_ids";

    /// <summary>
    /// Counts the rows in the data file of a data set version, optionally
    /// restricted by a WHERE condition.
    /// </summary>
    /// <param name="dataSetVersion">The data set version whose data file is queried.</param>
    /// <param name="where">The condition to filter by. No WHERE clause is added when it is empty.</param>
    /// <param name="cancellationToken">Token used to cancel the query.</param>
    /// <returns>The number of matching rows.</returns>
    public async Task<long> CountRows(
        DataSetVersion dataSetVersion,
        IInterpolatedSql where,
        CancellationToken cancellationToken = default)
    {
        using var _ = MiniProfiler.Current
            .Step($"{nameof(ParquetDataRepository)}.{nameof(CountRows)}");

        var command = duckDbConnection.SqlBuilder(
            $"""
             SELECT count(*)
             FROM '{parquetPathResolver.DataPath(dataSetVersion):raw}'
             """
        );

        command.AppendIf(!where.IsEmpty(), $"WHERE {where}");

        return await command.QuerySingleAsync<long>(cancellationToken: cancellationToken);
    }

    /// <summary>
    /// Lists one page of rows from the data file of a data set version.
    /// </summary>
    /// <param name="dataSetVersion">The data set version whose data file is queried.</param>
    /// <param name="columns">The columns to select for each row.</param>
    /// <param name="where">The condition to filter by. No WHERE clause is added when it is empty.</param>
    /// <param name="sorts">The sort criteria. No ORDER BY clause is added when null or empty.</param>
    /// <param name="page">The page to return. The row offset is <c>(page - 1) * pageSize</c>.</param>
    /// <param name="pageSize">The maximum number of rows to return.</param>
    /// <param name="cancellationToken">Token used to cancel the query.</param>
    /// <returns>The rows of the requested page, each as a dictionary of values keyed by string.</returns>
    /// <remarks>
    /// NOTE(review): <paramref name="columns"/> are written into the SQL text with the
    /// <c>raw</c> format, and sort fields are composed into ordering strings. This presumably
    /// relies on callers having validated both against the data set's columns - confirm.
    /// </remarks>
    public async Task<IEnumerable<IDictionary<string, object?>>> ListRows(
        DataSetVersion dataSetVersion,
        IEnumerable<string> columns,
        IInterpolatedSql where,
        IEnumerable<Sort>? sorts = null,
        int page = 1,
        int pageSize = 1000,
        CancellationToken cancellationToken = default)
    {
        using var timing = MiniProfiler.Current
            .Step($"{nameof(ParquetDataRepository)}.{nameof(ListRows)}");

        var dataPath = parquetPathResolver.DataPath(dataSetVersion);

        var whereFragment = new DuckDbSqlBuilder()
            .AppendIf(!where.IsEmpty(), $"WHERE {where}");

        // The direction becomes a SQL keyword, so upper-case it with the invariant
        // culture: the result must not depend on the culture of the current thread.
        var orderings = (sorts ?? [])
            .Select(s => $"{s.Field} {s.Direction.ToString().ToUpperInvariant()}")
            .ToList();

        var orderByFragment = new DuckDbSqlBuilder()
            .AppendIf(orderings.Count != 0, $"ORDER BY")
            .AppendRange(orderings, joinString: ",\n");

        var pageOffset = (page - 1) * pageSize;

        // We essentially split this query into two sub-queries:
        //
        // 1. The main query which is offset paginated and gathers the row ids
        // 2. Another query to fetch the rows using the ids from the main query (i.e. a 'deferred' join)
        //
        // This 'deferred join' technique is more efficient than a single query and helps to reduce
        // the performance penalty of using offset pagination having to scan through many rows.
        var command = duckDbConnection.SqlBuilder(
            $"""
             WITH {DataIdsAlias:raw} AS (
             SELECT {DataTable.Ref().Id:raw}
             FROM '{dataPath:raw}' AS {DataTable.TableName:raw}
             {whereFragment}
             {orderByFragment}
             LIMIT {pageSize}
             OFFSET {pageOffset}
             )
             SELECT {columns.Select(DataTable.Ref().Col).JoinToString(",\n"):raw}
             FROM '{dataPath:raw}' AS {DataTable.TableName:raw}
             JOIN {DataIdsAlias:raw} ON {DataIdsAlias:raw}.id = {DataTable.Ref().Id:raw}
             {orderByFragment}
             """
        );

        return (await command.QueryAsync(cancellationToken: cancellationToken))
            .Cast<IDictionary<string, object?>>();
    }

    /// <summary>
    /// Lists the column names of the data file of a data set version, obtained
    /// by running <c>DESCRIBE</c> over a select of the file.
    /// </summary>
    /// <param name="dataSetVersion">The data set version whose data file is described.</param>
    /// <param name="cancellationToken">Token used to cancel the query.</param>
    /// <returns>The set of column names.</returns>
    public async Task<ISet<string>> ListColumns(
        DataSetVersion dataSetVersion,
        CancellationToken cancellationToken = default)
    {
        var command = duckDbConnection.SqlBuilder(
            $"DESCRIBE SELECT * FROM '{parquetPathResolver.DataPath(dataSetVersion):raw}' LIMIT 1");

        var columns = await command.QueryAsync<ParquetColumn>(cancellationToken: cancellationToken);

        return columns
            .Select(col => col.ColumnName)
            .ToHashSet();
    }

    /// <summary>
    /// Lists the distinct level values in the locations file of a data set version,
    /// converted to <see cref="GeographicLevel"/> via <c>EnumUtil.GetFromEnumValue</c>.
    /// </summary>
    /// <param name="dataSetVersion">The data set version whose locations file is queried.</param>
    /// <param name="cancellationToken">Token used to cancel the query.</param>
    /// <returns>The set of geographic levels.</returns>
    public async Task<ISet<GeographicLevel>> ListLocationLevels(
        DataSetVersion dataSetVersion,
        CancellationToken cancellationToken = default)
    {
        var command = duckDbConnection.SqlBuilder(
            $"""
             SELECT DISTINCT {LocationsTable.Cols.Level:raw}
             FROM '{parquetPathResolver.LocationsPath(dataSetVersion):raw}'
             """);

        var levels = await command.QueryAsync<string>(cancellationToken: cancellationToken);

        return levels
            .Select(EnumUtil.GetFromEnumValue<GeographicLevel>)
            .ToHashSet();
    }

    /// <summary>
    /// Lists the distinct column name values in the filters file of a data set version.
    /// </summary>
    /// <param name="dataSetVersion">The data set version whose filters file is queried.</param>
    /// <param name="cancellationToken">Token used to cancel the query.</param>
    /// <returns>The set of filter column names.</returns>
    public async Task<ISet<string>> ListFilterColumns(
        DataSetVersion dataSetVersion,
        CancellationToken cancellationToken = default)
    {
        var command = duckDbConnection.SqlBuilder(
            $"""
             SELECT DISTINCT {FiltersTable.Cols.ColumnName:raw}
             FROM '{parquetPathResolver.FiltersPath(dataSetVersion):raw}'
             """);

        var cols = await command
            .QueryAsync<string>(cancellationToken: cancellationToken);

        return cols.ToHashSet();
    }

    /// <summary>
    /// Lists the distinct id values in the indicators file of a data set version.
    /// </summary>
    /// <param name="dataSetVersion">The data set version whose indicators file is queried.</param>
    /// <param name="cancellationToken">Token used to cancel the query.</param>
    /// <returns>The set of indicator ids, as strings.</returns>
    public async Task<ISet<string>> ListIndicatorColumns(
        DataSetVersion dataSetVersion,
        CancellationToken cancellationToken = default)
    {
        var command = duckDbConnection.SqlBuilder(
            $"""
             SELECT DISTINCT {IndicatorsTable.Cols.Id:raw}
             FROM '{parquetPathResolver.IndicatorsPath(dataSetVersion):raw}'
             """);

        var indicators = await command.QueryAsync<string>(cancellationToken: cancellationToken);

        return indicators.ToHashSet();
    }
}
Loading

0 comments on commit ec53169

Please sign in to comment.